From d0f0530ca20125d2429ccb5a8fcfa178ae1d03fd Mon Sep 17 00:00:00 2001 From: Harivansh Rathi Date: Sat, 11 Apr 2026 18:54:16 +0000 Subject: [PATCH] feat: power-cycle --- internal/config/config.go | 21 +++ internal/daemon/create.go | 6 + internal/daemon/daemon_test.go | 170 ++++++++++++++++++++- internal/daemon/lifecycle.go | 16 +- internal/daemon/review_regressions_test.go | 23 ++- internal/daemon/snapshot.go | 8 + main.go | 16 ++ 7 files changed, 242 insertions(+), 18 deletions(-) diff --git a/internal/config/config.go b/internal/config/config.go index 510d28e..799cd97 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -6,6 +6,7 @@ import ( "path/filepath" "strconv" "strings" + "time" "github.com/getcompanion-ai/computer-host/internal/firecracker" ) @@ -27,6 +28,7 @@ type Config struct { SocketPath string HTTPAddr string EgressInterface string + ReconcileInterval time.Duration FirecrackerBinaryPath string JailerBinaryPath string GuestLoginCAPublicKey string @@ -67,6 +69,10 @@ func Load() (Config, error) { JailerBinaryPath: strings.TrimSpace(os.Getenv("JAILER_BINARY_PATH")), GuestLoginCAPublicKey: strings.TrimSpace(os.Getenv("GUEST_LOGIN_CA_PUBLIC_KEY")), } + cfg.ReconcileInterval, err = durationDefault("FIRECRACKER_HOST_RECONCILE_INTERVAL", 5*time.Second) + if err != nil { + return Config{}, err + } if err := cfg.Validate(); err != nil { return Config{}, err } @@ -114,6 +120,9 @@ func (c Config) Validate() error { if strings.TrimSpace(c.EgressInterface) == "" { return fmt.Errorf("FIRECRACKER_HOST_EGRESS_INTERFACE is required") } + if c.ReconcileInterval <= 0 { + return fmt.Errorf("FIRECRACKER_HOST_RECONCILE_INTERVAL must be greater than zero") + } return nil } @@ -174,3 +183,15 @@ func loadBool(name string) (bool, error) { } return parsed, nil } + +func durationDefault(name string, fallback time.Duration) (time.Duration, error) { + value := strings.TrimSpace(os.Getenv(name)) + if value == "" { + return fallback, nil + } + parsed, err := time.ParseDuration(value) + if err != nil { + return 0, fmt.Errorf("%s must be a duration: %w", name, err) + } + return parsed, nil +} diff --git a/internal/daemon/create.go b/internal/daemon/create.go index 0bb914d..7777e9c 100644 --- a/internal/daemon/create.go +++ b/internal/daemon/create.go @@ -72,6 +72,12 @@ func (d *Daemon) CreateMachine(ctx context.Context, req contracthost.CreateMachi if err := os.Truncate(systemVolumePath, defaultGuestDiskSizeBytes); err != nil { return nil, fmt.Errorf("expand system volume for %q: %w", req.MachineID, err) } + if err := injectMachineIdentity(ctx, systemVolumePath, req.MachineID); err != nil { + return nil, fmt.Errorf("inject machine identity for %q: %w", req.MachineID, err) + } + if err := injectGuestConfig(ctx, systemVolumePath, guestConfig); err != nil { + return nil, fmt.Errorf("inject guest config for %q: %w", req.MachineID, err) + } removeSystemVolumeOnFailure := true defer func() { if !removeSystemVolumeOnFailure { diff --git a/internal/daemon/daemon_test.go b/internal/daemon/daemon_test.go index 3e2ce6e..d2f9929 100644 --- a/internal/daemon/daemon_test.go +++ b/internal/daemon/daemon_test.go @@ -236,6 +236,23 @@ func TestCreateMachineStagesArtifactsAndPersistsState(t *testing.T) { if machine.GuestConfig == nil || len(machine.GuestConfig.AuthorizedKeys) == 0 { t.Fatalf("stored guest config missing authorized keys: %#v", machine.GuestConfig) } + machineName, err := readExt4File(runtime.lastSpec.RootFSPath, "/etc/microagent/machine-name") + if err != nil { + t.Fatalf("read injected machine-name: %v", err) + } + if machineName != "vm-1\n" { + t.Fatalf("machine-name mismatch: got %q want %q", machineName, "vm-1\n") + } + injectedAuthorizedKeys, err := readExt4File(runtime.lastSpec.RootFSPath, "/etc/microagent/authorized_keys") + if err != nil { + t.Fatalf("read injected authorized_keys: %v", err) + } + if !strings.Contains(injectedAuthorizedKeys, strings.TrimSpace(string(hostAuthorizedKeyBytes))) { + t.Fatalf("disk authorized_keys missing backend ssh key: %q", injectedAuthorizedKeys) + } + if !strings.Contains(injectedAuthorizedKeys, "daemon-test") { + t.Fatalf("disk authorized_keys missing request override key: %q", injectedAuthorizedKeys) + } operations, err := fileStore.ListOperations(context.Background()) if err != nil { @@ -260,6 +277,11 @@ func TestStopMachineSyncsGuestFilesystemBeforeDelete(t *testing.T) { t.Fatalf("create daemon: %v", err) } + var syncedHost string + hostDaemon.syncGuestFilesystem = func(_ context.Context, runtimeHost string) error { + syncedHost = runtimeHost + return nil + } var shutdownHost string hostDaemon.shutdownGuest = func(_ context.Context, runtimeHost string) error { shutdownHost = runtimeHost @@ -295,6 +317,9 @@ func TestStopMachineSyncsGuestFilesystemBeforeDelete(t *testing.T) { if shutdownHost != "172.16.0.2" { t.Fatalf("shutdown host mismatch: got %q want %q", shutdownHost, "172.16.0.2") } + if syncedHost != "172.16.0.2" { + t.Fatalf("sync host mismatch: got %q want %q", syncedHost, "172.16.0.2") + } // runtime.Delete is always called to clean up TAP device and runtime dir. if len(runtime.deleteCalls) != 1 { t.Fatalf("runtime delete call count mismatch: got %d want 1", len(runtime.deleteCalls)) @@ -361,6 +386,10 @@ func TestReconcileStartingMachinePersonalizesBeforeRunning(t *testing.T) { t.Fatalf("create machine: %v", err) } + if err := hostDaemon.Reconcile(context.Background()); err != nil { + t.Fatalf("Reconcile returned error: %v", err) + } + response, err := hostDaemon.GetMachine(context.Background(), "vm-starting") if err != nil { t.Fatalf("GetMachine returned error: %v", err) @@ -376,6 +405,116 @@ func TestReconcileStartingMachinePersonalizesBeforeRunning(t *testing.T) { } } +func TestListMachinesDoesNotReconcileStartingMachines(t *testing.T) { + root := t.TempDir() + cfg := testConfig(root) + fileStore, err := store.NewFileStore(cfg.StatePath, cfg.OperationsPath) + if err != nil { + t.Fatalf("create file store: %v", err) + } + + hostDaemon, err := New(cfg, fileStore, &fakeRuntime{}) + if err != nil { + t.Fatalf("create daemon: %v", err) + } + hostDaemon.personalizeGuest = func(context.Context, *model.MachineRecord, firecracker.MachineState) error { + t.Fatalf("ListMachines should not reconcile guest personalization") + return nil + } + hostDaemon.readGuestSSHPublicKey = func(context.Context, string) (string, error) { + t.Fatalf("ListMachines should not read guest ssh public key") + return "", nil + } + + now := time.Now().UTC() + if err := fileStore.CreateMachine(context.Background(), model.MachineRecord{ + ID: "vm-list", + SystemVolumeID: "vm-list-system", + RuntimeHost: "127.0.0.1", + TapDevice: "fctap-list", + Ports: defaultMachinePorts(), + Phase: contracthost.MachinePhaseStarting, + PID: 4321, + SocketPath: filepath.Join(cfg.RuntimeDir, "machines", "vm-list", "root", "run", "firecracker.sock"), + CreatedAt: now, + StartedAt: &now, + }); err != nil { + t.Fatalf("create machine: %v", err) + } + + response, err := hostDaemon.ListMachines(context.Background()) + if err != nil { + t.Fatalf("ListMachines returned error: %v", err) + } + if len(response.Machines) != 1 { + t.Fatalf("machine count = %d, want 1", len(response.Machines)) + } + if response.Machines[0].Phase != contracthost.MachinePhaseStarting { + t.Fatalf("machine phase = %q, want %q", response.Machines[0].Phase, contracthost.MachinePhaseStarting) + } +} + +func TestReconcileStartingMachineIgnoresPersonalizationFailures(t *testing.T) { + root := t.TempDir() + cfg := testConfig(root) + fileStore, err := store.NewFileStore(cfg.StatePath, cfg.OperationsPath) + if err != nil { + t.Fatalf("create file store: %v", err) + } + + sshListener := listenTestPort(t, int(defaultSSHPort)) + defer func() { _ = sshListener.Close() }() + vncListener := listenTestPort(t, int(defaultVNCPort)) + defer func() { _ = vncListener.Close() }() + + startedAt := time.Unix(1700000201, 0).UTC() + runtime := &fakeRuntime{} + hostDaemon, err := New(cfg, fileStore, runtime) + if err != nil { + t.Fatalf("create daemon: %v", err) + } + hostDaemon.personalizeGuest = func(context.Context, *model.MachineRecord, firecracker.MachineState) error { + return errors.New("vsock EOF") + } + hostDaemon.readGuestSSHPublicKey = func(context.Context, string) (string, error) { + return "", errors.New("Permission denied") + } + + if err := fileStore.CreateMachine(context.Background(), model.MachineRecord{ + ID: "vm-best-effort", + SystemVolumeID: "vm-best-effort-system", + RuntimeHost: "127.0.0.1", + TapDevice: "fctap-best-effort", + Ports: defaultMachinePorts(), + GuestSSHPublicKey: "ssh-ed25519 AAAAExistingHostKey", + Phase: contracthost.MachinePhaseStarting, + PID: 4322, + SocketPath: filepath.Join(cfg.RuntimeDir, "machines", "vm-best-effort", "root", "run", "firecracker.sock"), + CreatedAt: time.Now().UTC(), + StartedAt: &startedAt, + }); err != nil { + t.Fatalf("create machine: %v", err) + } + + if err := hostDaemon.Reconcile(context.Background()); err != nil { + t.Fatalf("Reconcile returned error: %v", err) + } + + record, err := fileStore.GetMachine(context.Background(), "vm-best-effort") + if err != nil { + t.Fatalf("get machine: %v", err) + } + if record.Phase != contracthost.MachinePhaseRunning { + t.Fatalf("machine phase = %q, want %q", record.Phase, contracthost.MachinePhaseRunning) + } + if record.GuestSSHPublicKey != "ssh-ed25519 AAAAExistingHostKey" { + t.Fatalf("guest ssh public key = %q, want preserved value", record.GuestSSHPublicKey) + } + if len(runtime.deleteCalls) != 0 { + t.Fatalf("runtime delete calls = %d, want 0", len(runtime.deleteCalls)) + } +} + func TestNewEnsuresBackendSSHKeyPair(t *testing.T) { root := t.TempDir() cfg := testConfig(root) @@ -436,7 +575,7 @@ func TestRestoreSnapshotFallsBackToLocalSnapshotNetwork(t *testing.T) { server := newRestoreArtifactServer(t, map[string][]byte{ "/kernel": []byte("kernel"), "/rootfs": []byte("rootfs"), - "/system": []byte("disk"), + "/system": buildTestExt4ImageBytes(t), }) defer server.Close() @@ -582,7 +721,7 @@ func TestRestoreSnapshotUsesLocalSnapshotArtifacts(t *testing.T) { t.Fatalf("create snapshot dir: %v", err) } systemPath := filepath.Join(snapshotDir, "system.img") - if err := os.WriteFile(systemPath, []byte("disk"), 0o644); err != nil { + if err := os.WriteFile(systemPath, buildTestExt4ImageBytes(t), 0o644); err != nil { t.Fatalf("write system disk: %v", err) } if err := fileStore.CreateSnapshot(context.Background(), model.SnapshotRecord{ @@ -620,6 +759,13 @@ func TestRestoreSnapshotUsesLocalSnapshotArtifacts(t *testing.T) { if runtime.restoreCalls != 0 { t.Fatalf("restore boot call count mismatch: got %d want 0", runtime.restoreCalls) } + machineName, err := readExt4File(runtime.lastSpec.RootFSPath, "/etc/microagent/machine-name") + if err != nil { + t.Fatalf("read restored machine-name: %v", err) + } + if machineName != "restored-local\n" { + t.Fatalf("restored machine-name mismatch: got %q want %q", machineName, "restored-local\n") + } } func TestGetSnapshotArtifactReturnsLocalArtifactPath(t *testing.T) { @@ -713,7 +859,7 @@ func TestRestoreSnapshotUsesDurableSnapshotSpec(t *testing.T) { server := newRestoreArtifactServer(t, map[string][]byte{ "/kernel": []byte("kernel"), "/rootfs": []byte("rootfs"), - "/system": []byte("disk"), + "/system": buildTestExt4ImageBytes(t), "/user-0": []byte("user-disk"), }) defer server.Close() @@ -820,7 +966,7 @@ func TestRestoreSnapshotBootsWithFreshNetworkWhenSourceNetworkInUseOnHost(t *tes server := newRestoreArtifactServer(t, map[string][]byte{ "/kernel": []byte("kernel"), "/rootfs": []byte("rootfs"), - "/system": []byte("disk"), + "/system": buildTestExt4ImageBytes(t), }) defer server.Close() @@ -946,11 +1092,27 @@ func testConfig(root string) appconfig.Config { DriveIOEngine: firecracker.DriveIOEngineSync, SocketPath: filepath.Join(root, "firecracker-host.sock"), EgressInterface: "eth0", + ReconcileInterval: time.Second, FirecrackerBinaryPath: "/usr/bin/firecracker", JailerBinaryPath: "/usr/bin/jailer", } } +func buildTestExt4ImageBytes(t *testing.T) []byte { + t.Helper() + + root := t.TempDir() + imagePath := filepath.Join(root, "rootfs.ext4") + if err := buildTestExt4Image(root, imagePath); err != nil { + t.Fatalf("build ext4 image: %v", err) + } + payload, err := os.ReadFile(imagePath) + if err != nil { + t.Fatalf("read ext4 image: %v", err) + } + return payload +} + func TestGuestKernelArgsDisablesPCIByDefault(t *testing.T) { t.Parallel() diff --git a/internal/daemon/lifecycle.go b/internal/daemon/lifecycle.go index 42a4708..b96ce7c 100644 --- a/internal/daemon/lifecycle.go +++ b/internal/daemon/lifecycle.go @@ -17,7 +17,7 @@ import ( ) func (d *Daemon) GetMachine(ctx context.Context, id contracthost.MachineID) (*contracthost.GetMachineResponse, error) { - record, err := d.reconcileMachine(ctx, id) + record, err := d.store.GetMachine(ctx, id) if err != nil { return nil, err } @@ -32,11 +32,7 @@ func (d *Daemon) ListMachines(ctx context.Context) (*contracthost.ListMachinesRe machines := make([]contracthost.Machine, 0, len(records)) for _, record := range records { - reconciled, err := d.reconcileMachine(ctx, record.ID) - if err != nil { - return nil, err - } - machines = append(machines, machineToContract(*reconciled)) + machines = append(machines, machineToContract(record)) } return &contracthost.ListMachinesResponse{Machines: machines}, nil } @@ -391,11 +387,12 @@ func (d *Daemon) reconcileMachine(ctx context.Context, machineID contracthost.Ma return record, nil } if err := d.personalizeGuest(ctx, record, *state); err != nil { - return d.failMachineStartup(ctx, record, err.Error()) + fmt.Fprintf(os.Stderr, "warning: guest personalization for %q failed: %v\n", record.ID, err) } guestSSHPublicKey, err := d.readGuestSSHPublicKey(ctx, state.RuntimeHost) if err != nil { - return d.failMachineStartup(ctx, record, err.Error()) + fmt.Fprintf(os.Stderr, "warning: read guest ssh public key for %q failed: %v\n", record.ID, err) + guestSSHPublicKey = record.GuestSSHPublicKey } record.RuntimeHost = state.RuntimeHost record.TapDevice = state.TapName @@ -501,6 +498,9 @@ func (d *Daemon) stopMachineRecord(ctx context.Context, record *model.MachineRec d.stopPublishedPortsForMachine(record.ID) if record.Phase == contracthost.MachinePhaseRunning && strings.TrimSpace(record.RuntimeHost) != "" { + if err := d.syncGuestFilesystem(ctx, record.RuntimeHost); err != nil { + fmt.Fprintf(os.Stderr, "warning: guest filesystem sync for %q failed: %v\n", record.ID, err) + } d.shutdownGuestClean(ctx, record) } // Always call runtime.Delete: it cleans up the TAP device, runtime diff --git a/internal/daemon/review_regressions_test.go b/internal/daemon/review_regressions_test.go index af9edd2..5df2c00 100644 --- a/internal/daemon/review_regressions_test.go +++ b/internal/daemon/review_regressions_test.go @@ -468,8 +468,9 @@ func TestRestoreSnapshotTransitionsToStartingWithoutRelayAllocation(t *testing.T if err := os.MkdirAll(snapDir, 0o755); err != nil { t.Fatalf("create snapshot dir: %v", err) } + systemDisk := buildTestExt4ImageBytes(t) snapDisk := filepath.Join(snapDir, "system.img") - if err := os.WriteFile(snapDisk, []byte("disk"), 0o644); err != nil { + if err := os.WriteFile(snapDisk, systemDisk, 0o644); err != nil { t.Fatalf("write snapshot disk: %v", err) } if err := baseStore.CreateSnapshot(context.Background(), model.SnapshotRecord{ @@ -487,7 +488,7 @@ func TestRestoreSnapshotTransitionsToStartingWithoutRelayAllocation(t *testing.T server := newRestoreArtifactServer(t, map[string][]byte{ "/kernel": []byte("kernel"), "/rootfs": []byte("rootfs"), - "/system": []byte("disk"), + "/system": systemDisk, }) defer server.Close() @@ -594,8 +595,16 @@ func TestStopMachineContinuesWhenGuestSyncFails(t *testing.T) { t.Fatalf("create daemon: %v", err) } stubGuestSSHPublicKeyReader(hostDaemon) + hostDaemon.syncGuestFilesystem = func(context.Context, string) error { + return errors.New("guest sync failed") + } hostDaemon.shutdownGuest = func(context.Context, string) error { - return errors.New("guest shutdown failed") + runtime.inspectOverride = func(state firecracker.MachineState) (*firecracker.MachineState, error) { + state.Phase = firecracker.PhaseStopped + state.PID = 0 + return &state, nil + } + return nil } now := time.Now().UTC() @@ -678,11 +687,12 @@ func TestRestoreSnapshotCleansStagingArtifactsAfterSuccess(t *testing.T) { } stubGuestSSHPublicKeyReader(hostDaemon) hostDaemon.reconfigureGuestIdentity = func(context.Context, string, contracthost.MachineID, *contracthost.GuestConfig) error { return nil } + systemDisk := buildTestExt4ImageBytes(t) server := newRestoreArtifactServer(t, map[string][]byte{ "/kernel": []byte("kernel"), "/rootfs": []byte("rootfs"), - "/system": []byte("disk"), + "/system": systemDisk, }) defer server.Close() @@ -699,7 +709,7 @@ func TestRestoreSnapshotCleansStagingArtifactsAfterSuccess(t *testing.T) { SourceRuntimeHost: "172.16.0.2", SourceTapDevice: "fctap0", Artifacts: []contracthost.SnapshotArtifact{ - {ID: "disk-system", Kind: contracthost.SnapshotArtifactKindDisk, Name: "system.img", DownloadURL: server.URL + "/system", SHA256Hex: mustSHA256Hex(t, []byte("disk"))}, + {ID: "disk-system", Kind: contracthost.SnapshotArtifactKindDisk, Name: "system.img", DownloadURL: server.URL + "/system", SHA256Hex: mustSHA256Hex(t, systemDisk)}, }, }, }) @@ -727,11 +737,12 @@ func TestRestoreSnapshotCleansStagingArtifactsAfterDownloadFailure(t *testing.T) t.Fatalf("create daemon: %v", err) } stubGuestSSHPublicKeyReader(hostDaemon) + systemDisk := buildTestExt4ImageBytes(t) server := newRestoreArtifactServer(t, map[string][]byte{ "/kernel": []byte("kernel"), "/rootfs": []byte("rootfs"), - "/system": []byte("disk"), + "/system": systemDisk, }) defer server.Close() diff --git a/internal/daemon/snapshot.go b/internal/daemon/snapshot.go index 4f04616..1054194 100644 --- a/internal/daemon/snapshot.go +++ b/internal/daemon/snapshot.go @@ -265,6 +265,14 @@ func (d *Daemon) RestoreSnapshot(ctx context.Context, snapshotID contracthost.Sn clearOperation = true return nil, fmt.Errorf("copy system disk for restore: %w", err) } + if err := injectMachineIdentity(ctx, newSystemDiskPath, req.MachineID); err != nil { + clearOperation = true + return nil, fmt.Errorf("inject machine identity for restore: %w", err) + } + if err := injectGuestConfig(ctx, newSystemDiskPath, guestConfig); err != nil { + clearOperation = true + return nil, fmt.Errorf("inject guest config for restore: %w", err) + } type restoredUserVolume struct { ID contracthost.VolumeID diff --git a/main.go b/main.go index 5eca7ea..56e4a6f 100644 --- a/main.go +++ b/main.go @@ -10,6 +10,7 @@ import ( "os/signal" "path/filepath" "syscall" + "time" "golang.org/x/sync/errgroup" @@ -100,6 +101,21 @@ func main() { } return nil }) + group.Go(func() error { + ticker := time.NewTicker(cfg.ReconcileInterval) + defer ticker.Stop() + + for { + select { + case <-groupCtx.Done(): + return nil + case <-ticker.C: + if err := hostDaemon.Reconcile(groupCtx); err != nil && groupCtx.Err() == nil { + fmt.Fprintf(os.Stderr, "warning: firecracker-host reconcile failed: %v\n", err) + } + } + } + }) if err := group.Wait(); err != nil { exit(err)