host daemon (#2)

* feat: host daemon api scaffold

* fix: use sparse writes

* fix: unix socket length (<108 bytes)
This commit is contained in:
Hari 2026-04-08 11:23:19 -04:00 committed by GitHub
parent 4028bb5a1d
commit e2f9e54970
21 changed files with 2111 additions and 372 deletions

221
internal/daemon/create.go Normal file
View file

@ -0,0 +1,221 @@
package daemon
import (
"context"
"fmt"
"os"
"path/filepath"
"time"
"github.com/getcompanion-ai/computer-host/internal/firecracker"
"github.com/getcompanion-ai/computer-host/internal/model"
"github.com/getcompanion-ai/computer-host/internal/store"
contracthost "github.com/getcompanion-ai/computer-host/contract"
)
// CreateMachine provisions and boots a new machine for req.MachineID.
//
// The steps run in this order: validate the machine ID and artifact
// reference, take the per-machine lock, reject IDs that already have a stored
// machine record, journal a create operation, make the artifact available
// locally, clone the artifact root filesystem into the machine's system
// volume, build the machine spec, boot it through the runtime, and finally
// persist the system volume record, the user-volume attachments and the
// machine record.
//
// Failures before the runtime boot return immediately. Failures after boot
// undo the store writes made so far and delete the booted runtime state.
// The journaled create operation is removed only on success; on any error it
// stays in the store, where Reconcile handles it through reconcileCreate.
func (d *Daemon) CreateMachine(ctx context.Context, req contracthost.CreateMachineRequest) (*contracthost.CreateMachineResponse, error) {
if err := validateMachineID(req.MachineID); err != nil {
return nil, err
}
if err := validateArtifactRef(req.Artifact); err != nil {
return nil, err
}
// Serialize all work on this machine ID for the rest of the call.
unlock := d.lockMachine(req.MachineID)
defer unlock()
// A successful lookup means the ID is taken; only ErrNotFound lets us proceed.
if _, err := d.store.GetMachine(ctx, req.MachineID); err == nil {
return nil, fmt.Errorf("machine %q already exists", req.MachineID)
} else if err != nil && err != store.ErrNotFound {
return nil, err
}
// Journal the create before touching disk or the runtime.
if err := d.store.UpsertOperation(ctx, model.OperationRecord{
MachineID: req.MachineID,
Type: model.MachineOperationCreate,
StartedAt: time.Now().UTC(),
}); err != nil {
return nil, err
}
// clearOperation is flipped to true only on the success path below, so the
// journal entry survives every error return.
clearOperation := false
defer func() {
if clearOperation {
// Uses a fresh context so the cleanup is not tied to ctx.
_ = d.store.DeleteOperation(context.Background(), req.MachineID)
}
}()
artifact, err := d.ensureArtifact(ctx, req.Artifact)
if err != nil {
return nil, err
}
userVolumes, err := d.loadAttachableUserVolumes(ctx, req.MachineID, req.UserVolumeIDs)
if err != nil {
return nil, err
}
// Stage the system disk as a private copy of the artifact root filesystem.
systemVolumePath := d.systemVolumePath(req.MachineID)
if err := os.MkdirAll(filepath.Dir(systemVolumePath), 0o755); err != nil {
return nil, fmt.Errorf("create system volume dir for %q: %w", req.MachineID, err)
}
if err := cloneFile(artifact.RootFSPath, systemVolumePath); err != nil {
return nil, err
}
spec, err := d.buildMachineSpec(req.MachineID, artifact, userVolumes, systemVolumePath)
if err != nil {
return nil, err
}
// Networks of other running machines are passed to Boot.
// NOTE(review): presumably so Boot avoids reusing them — confirm in the runtime.
usedNetworks, err := d.listRunningNetworks(ctx, req.MachineID)
if err != nil {
return nil, err
}
state, err := d.runtime.Boot(ctx, spec, usedNetworks)
if err != nil {
return nil, err
}
// From here on the runtime is up: every failure must delete it again.
now := time.Now().UTC()
systemVolumeRecord := model.VolumeRecord{
ID: d.systemVolumeID(req.MachineID),
Kind: contracthost.VolumeKindSystem,
AttachedMachineID: machineIDPtr(req.MachineID),
SourceArtifact: &req.Artifact,
Pool: model.StoragePoolMachineDisks,
Path: systemVolumePath,
CreatedAt: now,
}
if err := d.store.CreateVolume(ctx, systemVolumeRecord); err != nil {
_ = d.runtime.Delete(context.Background(), *state)
return nil, err
}
// Mark each user volume as attached, remembering which updates succeeded so
// a later failure can detach exactly those.
attachedUserVolumeIDs := make([]contracthost.VolumeID, 0, len(userVolumes))
for _, volume := range userVolumes {
volume.AttachedMachineID = machineIDPtr(req.MachineID)
if err := d.store.UpdateVolume(ctx, volume); err != nil {
// Best-effort rollback: detach the volumes already updated, then drop
// the system volume record and the booted runtime.
for _, attachedVolumeID := range attachedUserVolumeIDs {
attachedVolume, getErr := d.store.GetVolume(context.Background(), attachedVolumeID)
if getErr == nil {
attachedVolume.AttachedMachineID = nil
_ = d.store.UpdateVolume(context.Background(), *attachedVolume)
}
}
_ = d.store.DeleteVolume(context.Background(), systemVolumeRecord.ID)
_ = d.runtime.Delete(context.Background(), *state)
return nil, err
}
attachedUserVolumeIDs = append(attachedUserVolumeIDs, volume.ID)
}
record := model.MachineRecord{
ID: req.MachineID,
Artifact: req.Artifact,
SystemVolumeID: systemVolumeRecord.ID,
UserVolumeIDs: append([]contracthost.VolumeID(nil), attachedUserVolumeIDs...),
RuntimeHost: state.RuntimeHost,
TapDevice: state.TapName,
Ports: defaultMachinePorts(),
Phase: contracthost.MachinePhaseRunning,
PID: state.PID,
SocketPath: state.SocketPath,
CreatedAt: now,
StartedAt: state.StartedAt,
}
if err := d.store.CreateMachine(ctx, record); err != nil {
// Best-effort rollback of everything persisted after boot.
for _, volume := range userVolumes {
volume.AttachedMachineID = nil
_ = d.store.UpdateVolume(context.Background(), volume)
}
_ = d.store.DeleteVolume(context.Background(), systemVolumeRecord.ID)
_ = d.runtime.Delete(context.Background(), *state)
return nil, err
}
clearOperation = true
return &contracthost.CreateMachineResponse{Machine: machineToContract(record)}, nil
}
// buildMachineSpec assembles and validates the Firecracker machine spec for
// machineID.
//
// The system volume at systemVolumePath becomes the root filesystem, the
// kernel comes from artifact, and every entry of userVolumes is added as a
// writable drive named "user-<index>" in slice order. Guest CPU, memory and
// kernel arguments use the package defaults. If validation fails, a zero
// spec and the validation error are returned.
func (d *Daemon) buildMachineSpec(machineID contracthost.MachineID, artifact *model.ArtifactRecord, userVolumes []model.VolumeRecord, systemVolumePath string) (firecracker.MachineSpec, error) {
	userDrives := make([]firecracker.DriveSpec, len(userVolumes))
	for idx := range userVolumes {
		userDrives[idx] = firecracker.DriveSpec{
			ID:       fmt.Sprintf("user-%d", idx),
			Path:     userVolumes[idx].Path,
			ReadOnly: false,
		}
	}

	machineSpec := firecracker.MachineSpec{
		ID:              firecracker.MachineID(machineID),
		VCPUs:           defaultGuestVCPUs,
		MemoryMiB:       defaultGuestMemoryMiB,
		KernelImagePath: artifact.KernelImagePath,
		RootFSPath:      systemVolumePath,
		KernelArgs:      defaultGuestKernelArgs,
		Drives:          userDrives,
	}

	validationErr := machineSpec.Validate()
	if validationErr != nil {
		return firecracker.MachineSpec{}, validationErr
	}
	return machineSpec, nil
}
// ensureArtifact returns the stored record for ref, downloading the kernel
// and root filesystem first when the store has no record yet.
//
// The work is serialized per artifact key. On a store miss the files are
// fetched into <ArtifactsDir>/<key>/kernel and <ArtifactsDir>/<key>/rootfs
// (kernel first), and a new record is written to the store before it is
// returned. Any store error other than store.ErrNotFound is returned as is.
func (d *Daemon) ensureArtifact(ctx context.Context, ref contracthost.ArtifactRef) (*model.ArtifactRecord, error) {
	localKey := artifactKey(ref)
	release := d.lockArtifact(localKey)
	defer release()

	cached, lookupErr := d.store.GetArtifact(ctx, ref)
	switch {
	case lookupErr == nil:
		return cached, nil
	case lookupErr != store.ErrNotFound:
		return nil, lookupErr
	}

	artifactDir := filepath.Join(d.config.ArtifactsDir, localKey)
	if mkdirErr := os.MkdirAll(artifactDir, 0o755); mkdirErr != nil {
		return nil, fmt.Errorf("create artifact dir %q: %w", artifactDir, mkdirErr)
	}

	record := model.ArtifactRecord{
		Ref:             ref,
		LocalKey:        localKey,
		LocalDir:        artifactDir,
		KernelImagePath: filepath.Join(artifactDir, "kernel"),
		RootFSPath:      filepath.Join(artifactDir, "rootfs"),
	}

	// Kernel first, then root filesystem; stop at the first failure.
	downloads := []struct{ from, to string }{
		{ref.KernelImageURL, record.KernelImagePath},
		{ref.RootFSURL, record.RootFSPath},
	}
	for _, item := range downloads {
		if downloadErr := downloadFile(ctx, item.from, item.to); downloadErr != nil {
			return nil, downloadErr
		}
	}

	record.CreatedAt = time.Now().UTC()
	if putErr := d.store.PutArtifact(ctx, record); putErr != nil {
		return nil, putErr
	}
	return &record, nil
}
// loadAttachableUserVolumes loads the volume records named by volumeIDs and
// checks that each one may be attached to machineID.
//
// A volume is rejected when it is listed more than once in volumeIDs, when it
// is not a user volume, or when it is already attached to a different
// machine. A volume already attached to machineID itself is accepted. Store
// lookup errors are returned unchanged. The result preserves the order of
// volumeIDs.
func (d *Daemon) loadAttachableUserVolumes(ctx context.Context, machineID contracthost.MachineID, volumeIDs []contracthost.VolumeID) ([]model.VolumeRecord, error) {
	volumes := make([]model.VolumeRecord, 0, len(volumeIDs))
	// Track requested IDs: attaching the same disk image twice would expose
	// one backing file to the guest as two writable drives.
	seen := make(map[contracthost.VolumeID]struct{}, len(volumeIDs))
	for _, volumeID := range volumeIDs {
		if _, duplicate := seen[volumeID]; duplicate {
			return nil, fmt.Errorf("volume %q is listed more than once", volumeID)
		}
		seen[volumeID] = struct{}{}

		volume, err := d.store.GetVolume(ctx, volumeID)
		if err != nil {
			return nil, err
		}
		if volume.Kind != contracthost.VolumeKindUser {
			return nil, fmt.Errorf("volume %q is not a user volume", volumeID)
		}
		if volume.AttachedMachineID != nil && *volume.AttachedMachineID != machineID {
			return nil, fmt.Errorf("volume %q is already attached to machine %q", volumeID, *volume.AttachedMachineID)
		}
		volumes = append(volumes, *volume)
	}
	return volumes, nil
}

View file

@ -1,12 +1,95 @@
package daemon
import (
"context"
"fmt"
"os"
"sync"
appconfig "github.com/getcompanion-ai/computer-host/internal/config"
"github.com/getcompanion-ai/computer-host/internal/firecracker"
"github.com/getcompanion-ai/computer-host/internal/store"
contracthost "github.com/getcompanion-ai/computer-host/contract"
)
// NOTE(review): this empty Runtime declaration appears to be the pre-change
// line of the diff hunk; Runtime is declared again further down with Boot,
// Inspect and Delete methods. Confirm the real file keeps only one of them.
type Runtime interface{}
// Package-level defaults applied to every machine the daemon creates.
const (
// defaultGuestKernelArgs is the kernel command line placed in each machine spec.
defaultGuestKernelArgs = "console=ttyS0 reboot=k panic=1 pci=off"
// defaultGuestMemoryMiB is the guest memory size, in MiB, placed in each machine spec.
defaultGuestMemoryMiB = int64(512)
// defaultGuestVCPUs is the vCPU count placed in each machine spec.
defaultGuestVCPUs = int64(1)
// defaultSSHPort is the port number reported for a machine's SSH port entry.
defaultSSHPort = uint16(2222)
// defaultVNCPort is the port number reported for a machine's VNC port entry.
defaultVNCPort = uint16(6080)
// defaultCopyBufferSize is the size, in bytes (1 MiB), of the read buffer
// used by writeSparseFile; zero detection works on chunks of at most this size.
defaultCopyBufferSize = 1024 * 1024
)
// Runtime is the machine runtime the daemon drives. The daemon receives it
// through New and calls it when creating, inspecting, stopping and deleting
// machines.
type Runtime interface {
// Boot starts a machine from the given spec and returns its runtime state.
// The daemon passes the network allocations of the other running machines.
// NOTE(review): presumably so the runtime can avoid reusing them — confirm.
Boot(context.Context, firecracker.MachineSpec, []firecracker.NetworkAllocation) (*firecracker.MachineState, error)
// Inspect returns the current state for a previously recorded machine state.
// The daemon treats any returned phase other than running as a dead machine.
Inspect(firecracker.MachineState) (*firecracker.MachineState, error)
// Delete tears down the runtime resources described by the given state.
// The daemon uses it for stop, delete, rollback and failed-machine cleanup.
Delete(context.Context, firecracker.MachineState) error
}
// Daemon implements the host daemon operations (create, get, list, stop,
// delete, reconcile) on top of a store and a machine runtime. Construct it
// with New, which initializes the lock maps.
type Daemon struct {
// NOTE(review): the exported Store and Runtime fields look like the
// pre-change lines of the diff hunk; New only populates the unexported
// store and runtime fields below. Confirm whether they still exist.
Store store.Store
Runtime Runtime
// config holds the validated daemon configuration (directories and paths).
config appconfig.Config
// store persists machine, volume, artifact and operation records.
store store.Store
// runtime boots, inspects and deletes machines.
runtime Runtime
// locksMu guards lookups and inserts on machineLocks and artifactLocks.
locksMu sync.Mutex
// machineLocks holds one mutex per machine ID, created on first use.
machineLocks map[contracthost.MachineID]*sync.Mutex
// artifactLocks holds one mutex per artifact key, created on first use.
artifactLocks map[string]*sync.Mutex
}
// New validates cfg, requires a non-nil store and runtime, creates the
// artifacts, machine-disks and runtime directories (mode 0755), and returns a
// Daemon with empty per-machine and per-artifact lock tables.
//
// It returns the configuration's validation error, "store is required",
// "runtime is required", or a wrapped directory-creation error.
func New(cfg appconfig.Config, store store.Store, runtime Runtime) (*Daemon, error) {
	if validationErr := cfg.Validate(); validationErr != nil {
		return nil, validationErr
	}

	switch {
	case store == nil:
		return nil, fmt.Errorf("store is required")
	case runtime == nil:
		return nil, fmt.Errorf("runtime is required")
	}

	requiredDirs := [...]string{cfg.ArtifactsDir, cfg.MachineDisksDir, cfg.RuntimeDir}
	for i := range requiredDirs {
		if mkdirErr := os.MkdirAll(requiredDirs[i], 0o755); mkdirErr != nil {
			return nil, fmt.Errorf("create daemon dir %q: %w", requiredDirs[i], mkdirErr)
		}
	}

	daemon := new(Daemon)
	daemon.config = cfg
	daemon.store = store
	daemon.runtime = runtime
	daemon.machineLocks = map[contracthost.MachineID]*sync.Mutex{}
	daemon.artifactLocks = map[string]*sync.Mutex{}
	return daemon, nil
}
// Health reports OK when the store can list machines; a listing error is
// returned unchanged and no response is produced.
func (d *Daemon) Health(ctx context.Context) (*contracthost.HealthResponse, error) {
	_, listErr := d.store.ListMachines(ctx)
	if listErr != nil {
		return nil, listErr
	}
	healthy := contracthost.HealthResponse{OK: true}
	return &healthy, nil
}
// lockMachine acquires the mutex dedicated to machineID, creating it on
// first use, and returns the function that releases it. The lock table is
// only held while the mutex is looked up, not while waiting for it.
func (d *Daemon) lockMachine(machineID contracthost.MachineID) func() {
	machineMu := func() *sync.Mutex {
		d.locksMu.Lock()
		defer d.locksMu.Unlock()
		if existing, found := d.machineLocks[machineID]; found {
			return existing
		}
		created := &sync.Mutex{}
		d.machineLocks[machineID] = created
		return created
	}()

	machineMu.Lock()
	return machineMu.Unlock
}
// lockArtifact acquires the mutex dedicated to the artifact key, creating it
// on first use, and returns the function that releases it. The lock table is
// only held while the mutex is looked up, not while waiting for it.
func (d *Daemon) lockArtifact(key string) func() {
	artifactMu := func() *sync.Mutex {
		d.locksMu.Lock()
		defer d.locksMu.Unlock()
		if existing, found := d.artifactLocks[key]; found {
			return existing
		}
		created := &sync.Mutex{}
		d.artifactLocks[key] = created
		return created
	}()

	artifactMu.Lock()
	return artifactMu.Unlock
}

View file

@ -0,0 +1,211 @@
package daemon
import (
"context"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"testing"
"time"
appconfig "github.com/getcompanion-ai/computer-host/internal/config"
"github.com/getcompanion-ai/computer-host/internal/firecracker"
"github.com/getcompanion-ai/computer-host/internal/store"
contracthost "github.com/getcompanion-ai/computer-host/contract"
)
// fakeRuntime is a test double for Runtime that records how it was called
// and never fails.
type fakeRuntime struct {
	// bootState is the state a copy of which every Boot call returns.
	bootState firecracker.MachineState
	// bootCalls counts Boot invocations.
	bootCalls int
	// deleteCalls records the state passed to each Delete call, in order.
	deleteCalls []firecracker.MachineState
	// lastSpec is the spec passed to the most recent Boot call.
	lastSpec firecracker.MachineSpec
}

// Boot records the call and its spec, then returns a private copy of bootState.
func (f *fakeRuntime) Boot(_ context.Context, spec firecracker.MachineSpec, _ []firecracker.NetworkAllocation) (*firecracker.MachineState, error) {
	f.bootCalls, f.lastSpec = f.bootCalls+1, spec
	booted := new(firecracker.MachineState)
	*booted = f.bootState
	return booted, nil
}

// Inspect echoes the given state back unchanged.
func (f *fakeRuntime) Inspect(state firecracker.MachineState) (*firecracker.MachineState, error) {
	// state is already this call's own copy, so its address can be returned.
	return &state, nil
}

// Delete records the state it was asked to delete and reports success.
func (f *fakeRuntime) Delete(_ context.Context, state firecracker.MachineState) error {
	recorded := append(f.deleteCalls, state)
	f.deleteCalls = recorded
	return nil
}
// TestCreateMachineStagesArtifactsAndPersistsState runs CreateMachine end to
// end against a file-backed store, a fake runtime and a local HTTP server
// that serves the kernel and rootfs payloads. It checks the returned machine,
// the spec handed to the runtime, the staged files on disk, the persisted
// artifact and machine records, and that the operation journal is empty
// afterwards.
func TestCreateMachineStagesArtifactsAndPersistsState(t *testing.T) {
t.Parallel()
root := t.TempDir()
cfg := testConfig(root)
fileStore, err := store.NewFileStore(cfg.StatePath, cfg.OperationsPath)
if err != nil {
t.Fatalf("create file store: %v", err)
}
// The fake runtime hands this state back from Boot.
startedAt := time.Unix(1700000005, 0).UTC()
runtime := &fakeRuntime{
bootState: firecracker.MachineState{
ID: "vm-1",
Phase: firecracker.PhaseRunning,
PID: 4321,
RuntimeHost: "172.16.0.2",
SocketPath: filepath.Join(cfg.RuntimeDir, "machines", "vm-1", "root", "run", "firecracker.sock"),
TapName: "fctap0",
StartedAt: &startedAt,
},
}
hostDaemon, err := New(cfg, fileStore, runtime)
if err != nil {
t.Fatalf("create daemon: %v", err)
}
// Serve the two artifact payloads over HTTP so the download path is exercised.
kernelPayload := []byte("kernel-image")
rootFSPayload := []byte("rootfs-image")
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.URL.Path {
case "/kernel":
_, _ = w.Write(kernelPayload)
case "/rootfs":
_, _ = w.Write(rootFSPayload)
default:
http.NotFound(w, r)
}
}))
defer server.Close()
response, err := hostDaemon.CreateMachine(context.Background(), contracthost.CreateMachineRequest{
MachineID: "vm-1",
Artifact: contracthost.ArtifactRef{
KernelImageURL: server.URL + "/kernel",
RootFSURL: server.URL + "/rootfs",
},
})
if err != nil {
t.Fatalf("create machine: %v", err)
}
// The response must reflect the booted runtime state and the default ports.
if response.Machine.Phase != contracthost.MachinePhaseRunning {
t.Fatalf("machine phase mismatch: got %q", response.Machine.Phase)
}
if response.Machine.RuntimeHost != "172.16.0.2" {
t.Fatalf("runtime host mismatch: got %q", response.Machine.RuntimeHost)
}
if len(response.Machine.Ports) != 2 {
t.Fatalf("machine ports mismatch: got %d want 2", len(response.Machine.Ports))
}
// The runtime must have been booted exactly once, with files that exist on disk.
if runtime.bootCalls != 1 {
t.Fatalf("boot call count mismatch: got %d want 1", runtime.bootCalls)
}
if runtime.lastSpec.KernelImagePath == "" || runtime.lastSpec.RootFSPath == "" {
t.Fatalf("runtime spec paths not populated: %#v", runtime.lastSpec)
}
if _, err := os.Stat(runtime.lastSpec.KernelImagePath); err != nil {
t.Fatalf("kernel artifact not staged: %v", err)
}
if _, err := os.Stat(runtime.lastSpec.RootFSPath); err != nil {
t.Fatalf("system disk not staged: %v", err)
}
// The artifact record must be persisted and point at the downloaded kernel.
artifact, err := fileStore.GetArtifact(context.Background(), response.Machine.Artifact)
if err != nil {
t.Fatalf("get artifact: %v", err)
}
if artifact.KernelImagePath == "" || artifact.RootFSPath == "" {
t.Fatalf("artifact paths missing: %#v", artifact)
}
if payload, err := os.ReadFile(artifact.KernelImagePath); err != nil {
t.Fatalf("read kernel artifact: %v", err)
} else if string(payload) != string(kernelPayload) {
t.Fatalf("kernel artifact payload mismatch: got %q", string(payload))
}
// The machine record must be persisted with the derived system volume ID.
machine, err := fileStore.GetMachine(context.Background(), "vm-1")
if err != nil {
t.Fatalf("get machine: %v", err)
}
if machine.SystemVolumeID != "vm-1-system" {
t.Fatalf("system volume mismatch: got %q", machine.SystemVolumeID)
}
// A successful create must clear its journal entry.
operations, err := fileStore.ListOperations(context.Background())
if err != nil {
t.Fatalf("list operations: %v", err)
}
if len(operations) != 0 {
t.Fatalf("operation journal should be empty after success: got %d entries", len(operations))
}
}
// TestCreateMachineRejectsNonHTTPArtifactURLs checks that CreateMachine
// refuses an artifact whose kernel URL uses the file scheme and reports the
// exact validation message.
func TestCreateMachineRejectsNonHTTPArtifactURLs(t *testing.T) {
	t.Parallel()

	cfg := testConfig(t.TempDir())
	fileStore, err := store.NewFileStore(cfg.StatePath, cfg.OperationsPath)
	if err != nil {
		t.Fatalf("create file store: %v", err)
	}
	hostDaemon, err := New(cfg, fileStore, &fakeRuntime{})
	if err != nil {
		t.Fatalf("create daemon: %v", err)
	}

	request := contracthost.CreateMachineRequest{
		MachineID: "vm-1",
		Artifact: contracthost.ArtifactRef{
			KernelImageURL: "file:///kernel",
			RootFSURL:      "https://example.com/rootfs",
		},
	}
	_, createErr := hostDaemon.CreateMachine(context.Background(), request)
	if createErr == nil {
		t.Fatal("expected create machine to fail for non-http artifact url")
	}

	const wantMessage = "artifact.kernel_image_url must use http or https"
	if got := createErr.Error(); got != wantMessage {
		t.Fatalf("unexpected error: %q", got)
	}
}
// TestDeleteMachineMissingIsNoOp checks that deleting an unknown machine ID
// succeeds and never reaches the runtime.
func TestDeleteMachineMissingIsNoOp(t *testing.T) {
	t.Parallel()

	cfg := testConfig(t.TempDir())
	fileStore, err := store.NewFileStore(cfg.StatePath, cfg.OperationsPath)
	if err != nil {
		t.Fatalf("create file store: %v", err)
	}

	runtime := new(fakeRuntime)
	hostDaemon, err := New(cfg, fileStore, runtime)
	if err != nil {
		t.Fatalf("create daemon: %v", err)
	}

	deleteErr := hostDaemon.DeleteMachine(context.Background(), "missing")
	if deleteErr != nil {
		t.Fatalf("delete missing machine: %v", deleteErr)
	}
	if calls := len(runtime.deleteCalls); calls != 0 {
		t.Fatalf("delete runtime should not be called for missing machine")
	}
}
// testConfig returns a daemon configuration whose state files, directories
// and socket all live under root; the Firecracker and jailer binary paths are
// fixed /usr/bin locations.
func testConfig(root string) appconfig.Config {
	// under resolves path elements relative to root.
	under := func(elems ...string) string {
		return filepath.Join(append([]string{root}, elems...)...)
	}

	var cfg appconfig.Config
	cfg.RootDir = root
	cfg.StatePath = under("state", "state.json")
	cfg.OperationsPath = under("state", "ops.json")
	cfg.ArtifactsDir = under("artifacts")
	cfg.MachineDisksDir = under("machine-disks")
	cfg.RuntimeDir = under("runtime")
	cfg.SocketPath = under("firecracker-host.sock")
	cfg.FirecrackerBinaryPath = "/usr/bin/firecracker"
	cfg.JailerBinaryPath = "/usr/bin/jailer"
	return cfg
}

274
internal/daemon/files.go Normal file
View file

@ -0,0 +1,274 @@
package daemon
import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"github.com/getcompanion-ai/computer-host/internal/firecracker"
"github.com/getcompanion-ai/computer-host/internal/model"
contracthost "github.com/getcompanion-ai/computer-host/contract"
)
// systemVolumeID derives the ID of a machine's system volume by appending
// "-system" to the machine ID.
func (d *Daemon) systemVolumeID(machineID contracthost.MachineID) contracthost.VolumeID {
	derived := fmt.Sprintf("%s-system", machineID)
	return contracthost.VolumeID(derived)
}
// systemVolumePath returns where a machine's system disk image lives:
// <MachineDisksDir>/<machineID>/system.img.
func (d *Daemon) systemVolumePath(machineID contracthost.MachineID) string {
	machineDiskDir := filepath.Join(d.config.MachineDisksDir, string(machineID))
	return filepath.Join(machineDiskDir, "system.img")
}
// machineRuntimeBaseDir returns the per-machine runtime directory:
// <RuntimeDir>/machines/<machineID>.
func (d *Daemon) machineRuntimeBaseDir(machineID contracthost.MachineID) string {
	machinesRoot := filepath.Join(d.config.RuntimeDir, "machines")
	return filepath.Join(machinesRoot, string(machineID))
}
// artifactKey returns the hex-encoded SHA-256 digest of the kernel URL and
// the rootfs URL joined by a newline. The daemon uses it as the artifact's
// directory name and lock key.
func artifactKey(ref contracthost.ArtifactRef) string {
	hasher := sha256.New()
	// Writes to a hash never return an error.
	hasher.Write([]byte(ref.KernelImageURL))
	hasher.Write([]byte("\n"))
	hasher.Write([]byte(ref.RootFSURL))
	return hex.EncodeToString(hasher.Sum(nil))
}
// cloneFile copies source to target, keeping all-zero regions as holes.
//
// The copy is staged in "<target>.tmp", extended to the source's size,
// synced, closed, and then renamed over target; finally the parent directory
// is synced. The target's parent directory is created if needed.
//
// If any step after the staging file is created fails, the staging file is
// removed so that a failed clone does not leave a partial disk image behind.
// Every returned error is wrapped with the step and path that failed.
func cloneFile(source string, target string) error {
	if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
		return fmt.Errorf("create target dir for %q: %w", target, err)
	}

	sourceFile, err := os.Open(source)
	if err != nil {
		return fmt.Errorf("open source file %q: %w", source, err)
	}
	defer sourceFile.Close()

	sourceInfo, err := sourceFile.Stat()
	if err != nil {
		return fmt.Errorf("stat source file %q: %w", source, err)
	}

	tmpPath := target + ".tmp"
	targetFile, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o644)
	if err != nil {
		return fmt.Errorf("open target file %q: %w", tmpPath, err)
	}

	// Until the rename succeeds the staging file is garbage; remove it on
	// every early return. Removal is best-effort, the original error wins.
	renamed := false
	defer func() {
		if !renamed {
			_ = os.Remove(tmpPath)
		}
	}()

	if _, err := writeSparseFile(targetFile, sourceFile); err != nil {
		targetFile.Close()
		return fmt.Errorf("copy %q to %q: %w", source, tmpPath, err)
	}
	// A trailing run of zeros is only seeked over, never written, so the
	// file must be extended explicitly to the source's full length.
	if err := targetFile.Truncate(sourceInfo.Size()); err != nil {
		targetFile.Close()
		return fmt.Errorf("truncate target file %q: %w", tmpPath, err)
	}
	if err := targetFile.Sync(); err != nil {
		targetFile.Close()
		return fmt.Errorf("sync target file %q: %w", tmpPath, err)
	}
	if err := targetFile.Close(); err != nil {
		return fmt.Errorf("close target file %q: %w", tmpPath, err)
	}

	if err := os.Rename(tmpPath, target); err != nil {
		return fmt.Errorf("rename target file %q to %q: %w", tmpPath, target, err)
	}
	renamed = true

	return syncDir(filepath.Dir(target))
}
// downloadFile fetches rawURL with an HTTP GET bound to ctx and stores the
// body at path, keeping all-zero regions as holes.
//
// If path already exists the function returns nil without any network
// access. Otherwise the body is staged in "<path>.tmp", sized to the number
// of bytes received, synced, closed, and renamed to path; finally the parent
// directory is synced. Any response status other than 200 is an error.
//
// If any step after the staging file is created fails (including a cancelled
// or broken transfer), the staging file is removed so partial downloads do
// not accumulate on disk.
func downloadFile(ctx context.Context, rawURL string, path string) error {
	if _, err := os.Stat(path); err == nil {
		// Already downloaded by an earlier call.
		return nil
	} else if !os.IsNotExist(err) {
		return fmt.Errorf("stat download target %q: %w", path, err)
	}

	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
		return fmt.Errorf("create download dir for %q: %w", path, err)
	}

	request, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return fmt.Errorf("build download request for %q: %w", rawURL, err)
	}
	response, err := http.DefaultClient.Do(request)
	if err != nil {
		return fmt.Errorf("download %q: %w", rawURL, err)
	}
	defer response.Body.Close()
	if response.StatusCode != http.StatusOK {
		return fmt.Errorf("download %q: status %d", rawURL, response.StatusCode)
	}

	tmpPath := path + ".tmp"
	file, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o644)
	if err != nil {
		return fmt.Errorf("open download target %q: %w", tmpPath, err)
	}

	// Until the rename succeeds the staging file is garbage; remove it on
	// every early return. Removal is best-effort, the original error wins.
	renamed := false
	defer func() {
		if !renamed {
			_ = os.Remove(tmpPath)
		}
	}()

	size, err := writeSparseFile(file, response.Body)
	if err != nil {
		file.Close()
		return fmt.Errorf("write download target %q: %w", tmpPath, err)
	}
	// A trailing run of zeros is only seeked over, never written, so the
	// file must be extended explicitly to the number of bytes received.
	if err := file.Truncate(size); err != nil {
		file.Close()
		return fmt.Errorf("truncate download target %q: %w", tmpPath, err)
	}
	if err := file.Sync(); err != nil {
		file.Close()
		return fmt.Errorf("sync download target %q: %w", tmpPath, err)
	}
	if err := file.Close(); err != nil {
		return fmt.Errorf("close download target %q: %w", tmpPath, err)
	}

	if err := os.Rename(tmpPath, path); err != nil {
		return fmt.Errorf("rename download target %q to %q: %w", tmpPath, path, err)
	}
	renamed = true

	return syncDir(filepath.Dir(path))
}
// writeSparseFile streams source into targetFile at its current offset and
// returns the number of bytes consumed from source.
//
// Each chunk read (at most defaultCopyBufferSize bytes) that consists solely
// of zero bytes is skipped with a relative seek instead of being written.
// Because a trailing skipped chunk does not extend the file, callers are
// expected to truncate the file to the returned size. Reading stops at
// io.EOF, which is reported as success; any other read, seek or write error
// is returned together with the count accumulated so far.
func writeSparseFile(targetFile *os.File, source io.Reader) (int64, error) {
	var consumed int64
	scratch := make([]byte, defaultCopyBufferSize)

	for {
		n, readErr := source.Read(scratch)

		// Process whatever was read before looking at the read error.
		if n > 0 {
			data := scratch[:n]
			var outErr error
			if isZeroChunk(data) {
				_, outErr = targetFile.Seek(int64(n), io.SeekCurrent)
			} else {
				_, outErr = targetFile.Write(data)
			}
			if outErr != nil {
				return consumed, outErr
			}
			consumed += int64(n)
		}

		switch readErr {
		case nil:
			// More data may follow.
		case io.EOF:
			return consumed, nil
		default:
			return consumed, readErr
		}
	}
}
// isZeroChunk reports whether every byte of chunk is zero. An empty or nil
// chunk counts as all-zero.
func isZeroChunk(chunk []byte) bool {
	for i := 0; i < len(chunk); i++ {
		if chunk[i] != 0 {
			return false
		}
	}
	return true
}
// defaultMachinePorts returns a fresh slice with the two ports recorded for
// every machine: SSH on defaultSSHPort and VNC on defaultVNCPort, both TCP,
// in that order.
func defaultMachinePorts() []contracthost.MachinePort {
	sshPort := contracthost.MachinePort{
		Name:     contracthost.MachinePortNameSSH,
		Port:     defaultSSHPort,
		Protocol: contracthost.PortProtocolTCP,
	}
	vncPort := contracthost.MachinePort{
		Name:     contracthost.MachinePortNameVNC,
		Port:     defaultVNCPort,
		Protocol: contracthost.PortProtocolTCP,
	}
	return []contracthost.MachinePort{sshPort, vncPort}
}
// machineIDPtr returns a pointer to a private copy of machineID, for
// optional machine ID fields such as a volume's AttachedMachineID.
func machineIDPtr(machineID contracthost.MachineID) *contracthost.MachineID {
	// The parameter is already this call's own copy.
	return &machineID
}
// machineToContract converts a stored machine record into its contract
// representation. The volume ID and port slices are copied so the result
// does not share backing arrays with record; fields that exist only on the
// record (PID, socket path, tap device) are not carried over.
func machineToContract(record model.MachineRecord) contracthost.Machine {
	var machine contracthost.Machine
	machine.ID = record.ID
	machine.Artifact = record.Artifact
	machine.SystemVolumeID = record.SystemVolumeID
	machine.UserVolumeIDs = append([]contracthost.VolumeID(nil), record.UserVolumeIDs...)
	machine.RuntimeHost = record.RuntimeHost
	machine.Ports = append([]contracthost.MachinePort(nil), record.Ports...)
	machine.Phase = record.Phase
	machine.Error = record.Error
	machine.CreatedAt = record.CreatedAt
	machine.StartedAt = record.StartedAt
	return machine
}
// machineToRuntimeState rebuilds the runtime's view of a machine from its
// stored record. Running and failed phases map to their runtime
// counterparts; every other phase maps to stopped.
func machineToRuntimeState(record model.MachineRecord) firecracker.MachineState {
	runtimePhase := firecracker.PhaseStopped
	if record.Phase == contracthost.MachinePhaseRunning {
		runtimePhase = firecracker.PhaseRunning
	} else if record.Phase == contracthost.MachinePhaseFailed {
		runtimePhase = firecracker.PhaseFailed
	}

	var state firecracker.MachineState
	state.ID = firecracker.MachineID(record.ID)
	state.Phase = runtimePhase
	state.PID = record.PID
	state.RuntimeHost = record.RuntimeHost
	state.SocketPath = record.SocketPath
	state.TapName = record.TapDevice
	state.StartedAt = record.StartedAt
	state.Error = record.Error
	return state
}
// validateArtifactRef checks that both URLs of an artifact reference are
// acceptable download URLs. The kernel URL is checked first; the first
// failure is returned.
func validateArtifactRef(ref contracthost.ArtifactRef) error {
	checks := []struct{ field, value string }{
		{"artifact.kernel_image_url", ref.KernelImageURL},
		{"artifact.rootfs_url", ref.RootFSURL},
	}
	for _, check := range checks {
		if checkErr := validateDownloadURL(check.field, check.value); checkErr != nil {
			return checkErr
		}
	}
	return nil
}
// validateMachineID checks that machineID is usable as a single path
// element, since the daemon joins it into on-disk paths (machine disk and
// runtime directories) that are later removed recursively.
//
// It rejects, after trimming surrounding whitespace:
//   - an empty ID;
//   - "." and "..", which filepath.Base returns unchanged and which would
//     make those paths resolve to the parent directories themselves;
//   - any ID containing a path separator.
func validateMachineID(machineID contracthost.MachineID) error {
	value := strings.TrimSpace(string(machineID))
	if value == "" {
		return fmt.Errorf("machine_id is required")
	}
	// filepath.Base(".") == "." and filepath.Base("..") == "..", so the
	// separator check below cannot catch these two.
	if value == "." || value == ".." {
		return fmt.Errorf("machine_id %q must not be a relative path element", machineID)
	}
	if filepath.Base(value) != value {
		return fmt.Errorf("machine_id %q must not contain path separators", machineID)
	}
	return nil
}
// validateDownloadURL checks that raw is an absolute http or https URL with
// a host name. field is the request field name used as the prefix of every
// error message.
//
// After trimming surrounding whitespace it rejects: an empty value, a value
// url.Parse cannot parse, a scheme other than http or https, and a URL whose
// host name is empty. The host check uses the host without its port, so an
// authority that is only a port (for example "http://:8080/x") is rejected.
func validateDownloadURL(field string, raw string) error {
	value := strings.TrimSpace(raw)
	if value == "" {
		return fmt.Errorf("%s is required", field)
	}
	parsed, err := url.Parse(value)
	if err != nil {
		return fmt.Errorf("%s is invalid: %w", field, err)
	}
	if parsed.Scheme != "http" && parsed.Scheme != "https" {
		return fmt.Errorf("%s must use http or https", field)
	}
	// parsed.Host still contains ":port"; Hostname strips it.
	if strings.TrimSpace(parsed.Hostname()) == "" {
		return fmt.Errorf("%s host is required", field)
	}
	return nil
}
// syncDir opens the directory at path, syncs it and closes it again. The
// daemon calls it after renaming a file into that directory. Open, sync and
// close failures are each wrapped with the directory path; a sync failure
// takes precedence over a close failure.
func syncDir(path string) error {
	handle, openErr := os.Open(path)
	if openErr != nil {
		return fmt.Errorf("open dir %q: %w", path, openErr)
	}

	// The handle is closed whether or not the sync succeeded.
	syncErr := handle.Sync()
	closeErr := handle.Close()

	if syncErr != nil {
		return fmt.Errorf("sync dir %q: %w", path, syncErr)
	}
	if closeErr != nil {
		return fmt.Errorf("close dir %q: %w", path, closeErr)
	}
	return nil
}

View file

@ -0,0 +1,94 @@
package daemon
import (
"bytes"
"io"
"os"
"path/filepath"
"syscall"
"testing"
)
// TestCloneFilePreservesSparseDiskUsage builds a source file made of "head",
// a hole up to the 32 MiB offset and "tail", clones it with cloneFile, and
// checks that the clone has the same size, allocates less than half of that
// size on disk, and carries the same head, tail and zeroed hole contents.
// The test is skipped when the temp filesystem does not store the source
// itself sparsely.
func TestCloneFilePreservesSparseDiskUsage(t *testing.T) {
root := t.TempDir()
sourcePath := filepath.Join(root, "source.img")
targetPath := filepath.Join(root, "target.img")
sourceFile, err := os.OpenFile(sourcePath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o644)
if err != nil {
t.Fatalf("open source file: %v", err)
}
if _, err := sourceFile.Write([]byte("head")); err != nil {
sourceFile.Close()
t.Fatalf("write source prefix: %v", err)
}
// Seeking past the end before the next write leaves a hole in the file.
if _, err := sourceFile.Seek(32<<20, io.SeekStart); err != nil {
sourceFile.Close()
t.Fatalf("seek source hole: %v", err)
}
if _, err := sourceFile.Write([]byte("tail")); err != nil {
sourceFile.Close()
t.Fatalf("write source suffix: %v", err)
}
if err := sourceFile.Close(); err != nil {
t.Fatalf("close source file: %v", err)
}
sourceInfo, err := os.Stat(sourcePath)
if err != nil {
t.Fatalf("stat source file: %v", err)
}
sourceUsage, err := allocatedBytes(sourcePath)
if err != nil {
t.Fatalf("allocated bytes for source: %v", err)
}
// Without sparse support on this filesystem the assertion below is meaningless.
if sourceUsage >= sourceInfo.Size()/2 {
t.Skip("temp filesystem does not expose sparse allocation savings")
}
if err := cloneFile(sourcePath, targetPath); err != nil {
t.Fatalf("clone sparse file: %v", err)
}
targetInfo, err := os.Stat(targetPath)
if err != nil {
t.Fatalf("stat target file: %v", err)
}
if targetInfo.Size() != sourceInfo.Size() {
t.Fatalf("target size mismatch: got %d want %d", targetInfo.Size(), sourceInfo.Size())
}
targetUsage, err := allocatedBytes(targetPath)
if err != nil {
t.Fatalf("allocated bytes for target: %v", err)
}
if targetUsage >= targetInfo.Size()/2 {
t.Fatalf("target file is not sparse enough: allocated=%d size=%d", targetUsage, targetInfo.Size())
}
// Content checks: first 4 bytes, last 4 bytes, and the first MiB of the hole.
targetData, err := os.ReadFile(targetPath)
if err != nil {
t.Fatalf("read target file: %v", err)
}
if !bytes.Equal(targetData[:4], []byte("head")) {
t.Fatalf("target prefix mismatch: %q", string(targetData[:4]))
}
if !bytes.Equal(targetData[len(targetData)-4:], []byte("tail")) {
t.Fatalf("target suffix mismatch: %q", string(targetData[len(targetData)-4:]))
}
if !bytes.Equal(targetData[4:4+(1<<20)], make([]byte, 1<<20)) {
t.Fatal("target hole contents were not zeroed")
}
}
// allocatedBytes returns the block count that stat reports for path
// multiplied by 512. It returns the stat error when path cannot be
// inspected, and syscall.EINVAL when the platform-specific stat data is not
// a *syscall.Stat_t.
func allocatedBytes(path string) (int64, error) {
	fileInfo, statErr := os.Stat(path)
	if statErr != nil {
		return 0, statErr
	}

	if raw, isStat := fileInfo.Sys().(*syscall.Stat_t); isStat {
		const blockUnit = 512
		return raw.Blocks * blockUnit, nil
	}
	return 0, syscall.EINVAL
}

View file

@ -0,0 +1,327 @@
package daemon
import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"time"
"github.com/getcompanion-ai/computer-host/internal/firecracker"
"github.com/getcompanion-ai/computer-host/internal/model"
"github.com/getcompanion-ai/computer-host/internal/store"
contracthost "github.com/getcompanion-ai/computer-host/contract"
)
// GetMachine reconciles the stored record for id against the runtime and
// returns the resulting machine. Errors from the reconcile step, including a
// missing record, are returned unchanged.
func (d *Daemon) GetMachine(ctx context.Context, id contracthost.MachineID) (*contracthost.GetMachineResponse, error) {
	reconciled, reconcileErr := d.reconcileMachine(ctx, id)
	if reconcileErr != nil {
		return nil, reconcileErr
	}

	response := new(contracthost.GetMachineResponse)
	response.Machine = machineToContract(*reconciled)
	return response, nil
}
// ListMachines returns every stored machine after reconciling each record
// against the runtime.
//
// The records are listed first and then reconciled one at a time under the
// per-machine lock. A machine that is deleted between those two steps makes
// its reconcile return store.ErrNotFound; such a machine is simply left out
// of the result instead of failing the whole listing. Any other error aborts
// the call and is returned unchanged.
func (d *Daemon) ListMachines(ctx context.Context) (*contracthost.ListMachinesResponse, error) {
	records, err := d.store.ListMachines(ctx)
	if err != nil {
		return nil, err
	}
	machines := make([]contracthost.Machine, 0, len(records))
	for _, record := range records {
		reconciled, err := d.reconcileMachine(ctx, record.ID)
		if err == store.ErrNotFound {
			// Deleted concurrently since the list was taken.
			continue
		}
		if err != nil {
			return nil, err
		}
		machines = append(machines, machineToContract(*reconciled))
	}
	return &contracthost.ListMachinesResponse{Machines: machines}, nil
}
// StopMachine stops the machine with the given id while holding its lock.
//
// A machine already in the stopped phase is left untouched. Otherwise a stop
// operation is journaled, the machine is stopped, and the journal entry is
// removed again. If stopping fails, the journal entry is left in the store.
// A missing machine record is reported as the store's error.
func (d *Daemon) StopMachine(ctx context.Context, id contracthost.MachineID) error {
	release := d.lockMachine(id)
	defer release()

	record, err := d.store.GetMachine(ctx, id)
	if err != nil {
		return err
	}
	if record.Phase == contracthost.MachinePhaseStopped {
		return nil
	}

	journalEntry := model.OperationRecord{
		MachineID: id,
		Type:      model.MachineOperationStop,
		StartedAt: time.Now().UTC(),
	}
	if err := d.store.UpsertOperation(ctx, journalEntry); err != nil {
		return err
	}

	if err := d.stopMachineRecord(ctx, record); err != nil {
		// Keep the journal entry for a failed stop.
		return err
	}

	// Best-effort journal cleanup, independent of ctx.
	_ = d.store.DeleteOperation(context.Background(), id)
	return nil
}
// DeleteMachine deletes the machine with the given id while holding its lock.
//
// A machine with no stored record is treated as already deleted and nil is
// returned. Otherwise a delete operation is journaled, the machine is
// deleted, and the journal entry is removed again. If deletion fails, the
// journal entry is left in the store.
func (d *Daemon) DeleteMachine(ctx context.Context, id contracthost.MachineID) error {
	release := d.lockMachine(id)
	defer release()

	record, err := d.store.GetMachine(ctx, id)
	switch {
	case err == store.ErrNotFound:
		return nil
	case err != nil:
		return err
	}

	journalEntry := model.OperationRecord{
		MachineID: id,
		Type:      model.MachineOperationDelete,
		StartedAt: time.Now().UTC(),
	}
	if err := d.store.UpsertOperation(ctx, journalEntry); err != nil {
		return err
	}

	if err := d.deleteMachineRecord(ctx, record); err != nil {
		// Keep the journal entry for a failed delete.
		return err
	}

	// Best-effort journal cleanup, independent of ctx.
	_ = d.store.DeleteOperation(context.Background(), id)
	return nil
}
// Reconcile first works through every journaled operation, dispatching each
// to the create, stop or delete reconciler for its machine, and then
// reconciles every stored machine record against the runtime.
//
// Processing stops at the first error. An operation with a type other than
// create, stop or delete is reported as unsupported.
func (d *Daemon) Reconcile(ctx context.Context) error {
	pending, err := d.store.ListOperations(ctx)
	if err != nil {
		return err
	}

	for _, op := range pending {
		var handle func(context.Context, contracthost.MachineID) error
		switch op.Type {
		case model.MachineOperationCreate:
			handle = d.reconcileCreate
		case model.MachineOperationStop:
			handle = d.reconcileStop
		case model.MachineOperationDelete:
			handle = d.reconcileDelete
		default:
			return fmt.Errorf("unsupported operation type %q", op.Type)
		}
		if handleErr := handle(ctx, op.MachineID); handleErr != nil {
			return handleErr
		}
	}

	machines, err := d.store.ListMachines(ctx)
	if err != nil {
		return err
	}
	for i := range machines {
		if _, reconcileErr := d.reconcileMachine(ctx, machines[i].ID); reconcileErr != nil {
			return reconcileErr
		}
	}
	return nil
}
// listRunningNetworks returns the network allocation of every stored machine
// that is in the running phase, other than the machine named by ignore.
// Records with a blank runtime host or tap device are skipped. The
// allocation is derived from the record's runtime host and tap device; a
// derivation error aborts the call.
func (d *Daemon) listRunningNetworks(ctx context.Context, ignore contracthost.MachineID) ([]firecracker.NetworkAllocation, error) {
	machines, err := d.store.ListMachines(ctx)
	if err != nil {
		return nil, err
	}

	allocations := make([]firecracker.NetworkAllocation, 0, len(machines))
	for i := range machines {
		machine := &machines[i]
		isCandidate := machine.ID != ignore && machine.Phase == contracthost.MachinePhaseRunning
		if !isCandidate {
			continue
		}
		hasNetwork := strings.TrimSpace(machine.RuntimeHost) != "" && strings.TrimSpace(machine.TapDevice) != ""
		if !hasNetwork {
			continue
		}

		allocation, allocErr := firecracker.AllocationFromGuestIP(machine.RuntimeHost, machine.TapDevice)
		if allocErr != nil {
			return nil, allocErr
		}
		allocations = append(allocations, allocation)
	}
	return allocations, nil
}
// reconcileCreate settles a journaled create operation for machineID.
//
// If the machine record exists, the create got as far as persisting it: the
// record is reconciled against the runtime and the journal entry is removed.
// If the record does not exist, the leftovers of the unfinished create are
// cleaned up: the system volume file and record, any volume attachments to
// the machine, the machine's disk directory (best-effort) and its runtime
// directory. The journal entry is removed last.
func (d *Daemon) reconcileCreate(ctx context.Context, machineID contracthost.MachineID) error {
	_, lookupErr := d.store.GetMachine(ctx, machineID)
	switch {
	case lookupErr == nil:
		if _, err := d.reconcileMachine(ctx, machineID); err != nil {
			return err
		}
		return d.store.DeleteOperation(ctx, machineID)
	case lookupErr != store.ErrNotFound:
		return lookupErr
	}

	diskPath := d.systemVolumePath(machineID)
	if removeErr := os.Remove(diskPath); removeErr != nil && !os.IsNotExist(removeErr) {
		return fmt.Errorf("cleanup system volume for %q: %w", machineID, removeErr)
	}

	deleteErr := d.store.DeleteVolume(ctx, d.systemVolumeID(machineID))
	if deleteErr != nil && deleteErr != store.ErrNotFound {
		return deleteErr
	}

	if detachErr := d.detachVolumesForMachine(ctx, machineID); detachErr != nil {
		return detachErr
	}

	// Removing the disk directory is best-effort; its error is ignored.
	_ = os.RemoveAll(filepath.Dir(diskPath))

	if cleanupErr := os.RemoveAll(d.machineRuntimeBaseDir(machineID)); cleanupErr != nil {
		return fmt.Errorf("cleanup runtime dir for %q: %w", machineID, cleanupErr)
	}
	return d.store.DeleteOperation(ctx, machineID)
}
// reconcileStop settles a journaled stop operation for machineID: if the
// machine record still exists the stop is carried out again, and in either
// case the journal entry is then removed. Lookup and stop errors leave the
// journal entry in place.
func (d *Daemon) reconcileStop(ctx context.Context, machineID contracthost.MachineID) error {
	record, lookupErr := d.store.GetMachine(ctx, machineID)
	switch {
	case lookupErr == store.ErrNotFound:
		// Nothing left to stop; only the journal entry remains.
	case lookupErr != nil:
		return lookupErr
	default:
		if stopErr := d.stopMachineRecord(ctx, record); stopErr != nil {
			return stopErr
		}
	}
	return d.store.DeleteOperation(ctx, machineID)
}
// reconcileDelete settles a journaled delete operation for machineID.
//
// If the machine record is already gone, the remaining leftovers are cleaned
// up: the system volume file and record, any volume attachments to the
// machine, and (best-effort) the machine's disk and runtime directories.
// If the record still exists, the delete is carried out again. In both cases
// the journal entry is removed last; any error leaves it in place.
//
// A failure to remove the system volume file is wrapped with the machine ID,
// matching the same step in reconcileCreate.
func (d *Daemon) reconcileDelete(ctx context.Context, machineID contracthost.MachineID) error {
	record, err := d.store.GetMachine(ctx, machineID)
	if err == store.ErrNotFound {
		if err := os.Remove(d.systemVolumePath(machineID)); err != nil && !os.IsNotExist(err) {
			return fmt.Errorf("cleanup system volume for %q: %w", machineID, err)
		}
		if err := d.store.DeleteVolume(ctx, d.systemVolumeID(machineID)); err != nil && err != store.ErrNotFound {
			return err
		}
		if err := d.detachVolumesForMachine(ctx, machineID); err != nil {
			return err
		}
		// Directory removal is deliberately best-effort here.
		_ = os.RemoveAll(filepath.Dir(d.systemVolumePath(machineID)))
		_ = os.RemoveAll(d.machineRuntimeBaseDir(machineID))
		return d.store.DeleteOperation(ctx, machineID)
	}
	if err != nil {
		return err
	}
	if err := d.deleteMachineRecord(ctx, record); err != nil {
		return err
	}
	return d.store.DeleteOperation(ctx, machineID)
}
// reconcileMachine brings the stored record for machineID in line with what
// the runtime reports, under the per-machine lock, and returns the record.
//
// Records not in the running phase are returned as stored. For a running
// record the runtime is inspected; if it still reports running, the record
// is returned unchanged. Otherwise the runtime state is deleted, the record
// is marked failed with the runtime's error, its runtime fields (PID, socket
// path, host, tap device, start time) are cleared, and the update is saved.
func (d *Daemon) reconcileMachine(ctx context.Context, machineID contracthost.MachineID) (*model.MachineRecord, error) {
	release := d.lockMachine(machineID)
	defer release()

	machine, err := d.store.GetMachine(ctx, machineID)
	if err != nil {
		return nil, err
	}
	if machine.Phase != contracthost.MachinePhaseRunning {
		return machine, nil
	}

	observed, err := d.runtime.Inspect(machineToRuntimeState(*machine))
	if err != nil {
		return nil, err
	}
	if observed.Phase == firecracker.PhaseRunning {
		return machine, nil
	}

	// The runtime no longer considers the machine running: tear it down and
	// record the failure.
	if deleteErr := d.runtime.Delete(ctx, *observed); deleteErr != nil {
		return nil, deleteErr
	}

	machine.Phase, machine.Error = contracthost.MachinePhaseFailed, observed.Error
	machine.PID = 0
	machine.StartedAt = nil
	machine.SocketPath, machine.RuntimeHost, machine.TapDevice = "", "", ""

	if updateErr := d.store.UpdateMachine(ctx, *machine); updateErr != nil {
		return nil, updateErr
	}
	return machine, nil
}
// deleteMachineRecord removes everything that belongs to record: the runtime
// state, all volume attachments to the machine, the system volume file and
// its directory, the system volume record and finally the machine record.
//
// The function can be re-run after an interrupted delete. If the system
// volume record is already gone (an earlier attempt got past DeleteVolume
// but not DeleteMachine), the disk path falls back to the machine's standard
// system volume path — the path CreateMachine stores in the volume record —
// and the missing volume record is not treated as an error.
// NOTE(review): this assumes GetVolume reports a missing record as
// store.ErrNotFound, as DeleteVolume is assumed to elsewhere in this file.
func (d *Daemon) deleteMachineRecord(ctx context.Context, record *model.MachineRecord) error {
	if err := d.runtime.Delete(ctx, machineToRuntimeState(*record)); err != nil {
		return err
	}
	if err := d.detachVolumesForMachine(ctx, record.ID); err != nil {
		return err
	}

	systemVolumePath := d.systemVolumePath(record.ID)
	systemVolume, err := d.store.GetVolume(ctx, record.SystemVolumeID)
	switch {
	case err == nil:
		systemVolumePath = systemVolume.Path
	case err != store.ErrNotFound:
		return err
	}

	if err := os.Remove(systemVolumePath); err != nil && !os.IsNotExist(err) {
		return fmt.Errorf("remove system volume %q: %w", systemVolumePath, err)
	}
	if err := os.RemoveAll(filepath.Dir(systemVolumePath)); err != nil {
		return fmt.Errorf("remove machine disk dir %q: %w", filepath.Dir(systemVolumePath), err)
	}
	if err := d.store.DeleteVolume(ctx, record.SystemVolumeID); err != nil && err != store.ErrNotFound {
		return err
	}
	return d.store.DeleteMachine(ctx, record.ID)
}
// stopMachineRecord deletes the machine's runtime state and then rewrites
// record in place as stopped: the error is cleared and the runtime fields
// (PID, socket path, host, tap device, start time) are reset before the
// record is saved. If the runtime delete fails, record is left untouched.
func (d *Daemon) stopMachineRecord(ctx context.Context, record *model.MachineRecord) error {
	runtimeState := machineToRuntimeState(*record)
	if deleteErr := d.runtime.Delete(ctx, runtimeState); deleteErr != nil {
		return deleteErr
	}

	record.Phase, record.Error = contracthost.MachinePhaseStopped, ""
	record.PID = 0
	record.StartedAt = nil
	record.SocketPath, record.RuntimeHost, record.TapDevice = "", "", ""

	return d.store.UpdateMachine(ctx, *record)
}
// detachVolumesForMachine clears the attachment of every stored volume that
// is currently attached to machineID and saves each changed volume. It stops
// at the first store error.
func (d *Daemon) detachVolumesForMachine(ctx context.Context, machineID contracthost.MachineID) error {
	all, err := d.store.ListVolumes(ctx)
	if err != nil {
		return err
	}

	for i := range all {
		// Work on a copy; the listed slice itself is not modified.
		vol := all[i]
		owner := vol.AttachedMachineID
		if owner != nil && *owner == machineID {
			vol.AttachedMachineID = nil
			if updateErr := d.store.UpdateVolume(ctx, vol); updateErr != nil {
				return updateErr
			}
		}
	}
	return nil
}