Add a configurable disk clone mode to the firecracker-host daemon.

config
  * New Config.DiskCloneMode, loaded from FIRECRACKER_HOST_DISK_CLONE_MODE.
    An empty (or whitespace-only) value selects "reflink"; Validate accepts
    only "reflink" and "copy" and is called from Config.Validate.

daemon
  * New cloneDiskFile dispatches on the mode: "reflink" calls reflinkFile,
    "copy" calls the existing cloneFile, any other value is an error.
  * reflinkFile clones the source into "<target>.tmp" with ioctl request
    0x40049409, renames the temp file over the target and then calls syncDir
    on the parent directory. The temp file is removed when the clone, the
    close or the rename fails.
  * CreateMachine, CreateSnapshot and RestoreSnapshot call cloneDiskFile
    instead of cowCopyFile. cowCopyFile ("cp --reflink=auto --sparse=always"
    with a cloneFile fallback) is deleted, along with the "os/exec" import
    in snapshot.go.
  * testConfig in daemon_test.go pins DiskCloneModeCopy.

Review notes
  * TestCloneDiskFileReflinkMode skips on any cloneDiskFile error, so a
    regression inside reflinkFile is reported as a skip, not as a failure.
  * loadDiskCloneMode trims whitespace but compares case-sensitively, so a
    value such as "Copy" is rejected by Validate.
  * 0x40049409 is presumably Linux FICLONE; confirm the value for every
    architecture the daemon is built for.
  * Tab depth and column alignment of the lines below were reconstructed
    from gofmt conventions. If a context line does not match the tree, apply
    with "git apply --ignore-whitespace".

diff --git a/internal/config/config.go b/internal/config/config.go
index b76e8ad..8059457 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -20,6 +20,7 @@ type Config struct {
 	MachineDisksDir       string
 	SnapshotsDir          string
 	RuntimeDir            string
+	DiskCloneMode         DiskCloneMode
 	SocketPath            string
 	HTTPAddr              string
 	EgressInterface       string
@@ -28,6 +29,16 @@ type Config struct {
 	GuestLoginCAPublicKey string
 }
 
+// DiskCloneMode controls how the daemon materializes writable machine disks.
+type DiskCloneMode string
+
+const (
+	// DiskCloneModeReflink requires an O(1) copy-on-write clone and never falls back to a full copy.
+	DiskCloneModeReflink DiskCloneMode = "reflink"
+	// DiskCloneModeCopy performs a full sparse copy. Use only for local development or emergency fallback.
+	DiskCloneModeCopy DiskCloneMode = "copy"
+)
+
 // Load loads and validates the firecracker-host daemon configuration from the environment.
 func Load() (Config, error) {
 	rootDir := filepath.Clean(strings.TrimSpace(os.Getenv("FIRECRACKER_HOST_ROOT_DIR")))
@@ -39,6 +50,7 @@ func Load() (Config, error) {
 		MachineDisksDir:       filepath.Join(rootDir, "machine-disks"),
 		SnapshotsDir:          filepath.Join(rootDir, "snapshots"),
 		RuntimeDir:            filepath.Join(rootDir, "runtime"),
+		DiskCloneMode:         loadDiskCloneMode(os.Getenv("FIRECRACKER_HOST_DISK_CLONE_MODE")),
 		SocketPath:            filepath.Join(rootDir, defaultSocketName),
 		HTTPAddr:              strings.TrimSpace(os.Getenv("FIRECRACKER_HOST_HTTP_ADDR")),
 		EgressInterface:       strings.TrimSpace(os.Getenv("FIRECRACKER_HOST_EGRESS_INTERFACE")),
@@ -81,6 +93,9 @@ func (c Config) Validate() error {
 	if strings.TrimSpace(c.RuntimeDir) == "" {
 		return fmt.Errorf("runtime dir is required")
 	}
+	if err := c.DiskCloneMode.Validate(); err != nil {
+		return err
+	}
 	if strings.TrimSpace(c.SocketPath) == "" {
 		return fmt.Errorf("socket path is required")
 	}
@@ -99,3 +114,21 @@ func (c Config) FirecrackerRuntimeConfig() firecracker.RuntimeConfig {
 		JailerBinaryPath:      c.JailerBinaryPath,
 	}
 }
+
+func loadDiskCloneMode(raw string) DiskCloneMode {
+	value := strings.TrimSpace(raw)
+	if value == "" {
+		return DiskCloneModeReflink
+	}
+	return DiskCloneMode(value)
+}
+
+// Validate reports whether the clone mode is safe to use.
+func (m DiskCloneMode) Validate() error {
+	switch m {
+	case DiskCloneModeReflink, DiskCloneModeCopy:
+		return nil
+	default:
+		return fmt.Errorf("FIRECRACKER_HOST_DISK_CLONE_MODE must be %q or %q", DiskCloneModeReflink, DiskCloneModeCopy)
+	}
+}
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
new file mode 100644
index 0000000..3c15434
--- /dev/null
+++ b/internal/config/config_test.go
@@ -0,0 +1,40 @@
+package config
+
+import "testing"
+
+func TestLoadDiskCloneModeDefaultsToReflink(t *testing.T) {
+	t.Parallel()
+
+	if got := loadDiskCloneMode(""); got != DiskCloneModeReflink {
+		t.Fatalf("disk clone mode = %q, want %q", got, DiskCloneModeReflink)
+	}
+}
+
+func TestDiskCloneModeValidate(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name    string
+		mode    DiskCloneMode
+		wantErr bool
+	}{
+		{name: "reflink", mode: DiskCloneModeReflink},
+		{name: "copy", mode: DiskCloneModeCopy},
+		{name: "empty", mode: "", wantErr: true},
+		{name: "unknown", mode: "auto", wantErr: true},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
+			err := tt.mode.Validate()
+			if tt.wantErr && err == nil {
+				t.Fatal("Validate() error = nil, want error")
+			}
+			if !tt.wantErr && err != nil {
+				t.Fatalf("Validate() error = %v, want nil", err)
+			}
+		})
+	}
+}
diff --git a/internal/daemon/create.go b/internal/daemon/create.go
index d89b859..26fa435 100644
--- a/internal/daemon/create.go
+++ b/internal/daemon/create.go
@@ -7,10 +7,10 @@ import (
 	"path/filepath"
 	"time"
 
-	contracthost "github.com/getcompanion-ai/computer-host/contract"
 	"github.com/getcompanion-ai/computer-host/internal/firecracker"
 	"github.com/getcompanion-ai/computer-host/internal/model"
 	"github.com/getcompanion-ai/computer-host/internal/store"
+	contracthost "github.com/getcompanion-ai/computer-host/contract"
 )
 
 func (d *Daemon) CreateMachine(ctx context.Context, req contracthost.CreateMachineRequest) (*contracthost.CreateMachineResponse, error) {
@@ -66,8 +66,8 @@ func (d *Daemon) CreateMachine(ctx context.Context, req contracthost.CreateMachi
 	if err := os.MkdirAll(filepath.Dir(systemVolumePath), 0o755); err != nil {
 		return nil, fmt.Errorf("create system volume dir for %q: %w", req.MachineID, err)
 	}
-	if err := cowCopyFile(artifact.RootFSPath, systemVolumePath); err != nil {
-		return nil, err
+	if err := cloneDiskFile(artifact.RootFSPath, systemVolumePath, d.config.DiskCloneMode); err != nil {
+		return nil, fmt.Errorf("clone rootfs for %q: %w", req.MachineID, err)
 	}
 	removeSystemVolumeOnFailure := true
 	defer func() {
diff --git a/internal/daemon/daemon_test.go b/internal/daemon/daemon_test.go
index 790da33..1b41904 100644
--- a/internal/daemon/daemon_test.go
+++ b/internal/daemon/daemon_test.go
@@ -13,11 +13,11 @@ import (
 	"testing"
 	"time"
 
-	contracthost "github.com/getcompanion-ai/computer-host/contract"
 	appconfig "github.com/getcompanion-ai/computer-host/internal/config"
 	"github.com/getcompanion-ai/computer-host/internal/firecracker"
 	"github.com/getcompanion-ai/computer-host/internal/model"
 	"github.com/getcompanion-ai/computer-host/internal/store"
+	contracthost "github.com/getcompanion-ai/computer-host/contract"
 )
 
 type fakeRuntime struct {
@@ -757,6 +757,7 @@ func testConfig(root string) appconfig.Config {
 		MachineDisksDir:       filepath.Join(root, "machine-disks"),
 		SnapshotsDir:          filepath.Join(root, "snapshots"),
 		RuntimeDir:            filepath.Join(root, "runtime"),
+		DiskCloneMode:         appconfig.DiskCloneModeCopy,
 		SocketPath:            filepath.Join(root, "firecracker-host.sock"),
 		EgressInterface:       "eth0",
 		FirecrackerBinaryPath: "/usr/bin/firecracker",
diff --git a/internal/daemon/files.go b/internal/daemon/files.go
index 54930a4..9d56182 100644
--- a/internal/daemon/files.go
+++ b/internal/daemon/files.go
@@ -13,7 +13,9 @@ import (
 	"os/exec"
 	"path/filepath"
 	"strings"
+	"syscall"
 
+	appconfig "github.com/getcompanion-ai/computer-host/internal/config"
 	"github.com/getcompanion-ai/computer-host/internal/firecracker"
 	"github.com/getcompanion-ai/computer-host/internal/model"
 	contracthost "github.com/getcompanion-ai/computer-host/contract"
@@ -92,6 +94,70 @@ func cloneFile(source string, target string) error {
 	return nil
 }
 
+func cloneDiskFile(source string, target string, mode appconfig.DiskCloneMode) error {
+	switch mode {
+	case appconfig.DiskCloneModeReflink:
+		return reflinkFile(source, target)
+	case appconfig.DiskCloneModeCopy:
+		return cloneFile(source, target)
+	default:
+		return fmt.Errorf("unsupported disk clone mode %q", mode)
+	}
+}
+
+func reflinkFile(source string, target string) error {
+	if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
+		return fmt.Errorf("create target dir for %q: %w", target, err)
+	}
+
+	sourceFile, err := os.Open(source)
+	if err != nil {
+		return fmt.Errorf("open source file %q: %w", source, err)
+	}
+	defer func() {
+		_ = sourceFile.Close()
+	}()
+
+	sourceInfo, err := sourceFile.Stat()
+	if err != nil {
+		return fmt.Errorf("stat source file %q: %w", source, err)
+	}
+
+	tmpPath := target + ".tmp"
+	targetFile, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, sourceInfo.Mode().Perm())
+	if err != nil {
+		return fmt.Errorf("open target file %q: %w", tmpPath, err)
+	}
+
+	if err := ioctlFileClone(targetFile, sourceFile); err != nil {
+		_ = targetFile.Close()
+		_ = os.Remove(tmpPath)
+		return fmt.Errorf("reflink clone %q to %q: %w", source, tmpPath, err)
+	}
+	if err := targetFile.Close(); err != nil {
+		_ = os.Remove(tmpPath)
+		return fmt.Errorf("close target file %q: %w", tmpPath, err)
+	}
+	if err := os.Rename(tmpPath, target); err != nil {
+		_ = os.Remove(tmpPath)
+		return fmt.Errorf("rename target file %q to %q: %w", tmpPath, target, err)
+	}
+	if err := syncDir(filepath.Dir(target)); err != nil {
+		return err
+	}
+	return nil
+}
+
+func ioctlFileClone(targetFile *os.File, sourceFile *os.File) error {
+	const ficlone = 0x40049409
+
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, targetFile.Fd(), ficlone, sourceFile.Fd())
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
 func downloadFile(ctx context.Context, rawURL string, path string) error {
 	if _, err := os.Stat(path); err == nil {
 		return nil
diff --git a/internal/daemon/files_test.go b/internal/daemon/files_test.go
index f9458f8..3ebb856 100644
--- a/internal/daemon/files_test.go
+++ b/internal/daemon/files_test.go
@@ -7,9 +7,11 @@ import (
 	"path/filepath"
 	"syscall"
 	"testing"
+
+	appconfig "github.com/getcompanion-ai/computer-host/internal/config"
 )
 
-func TestCloneFilePreservesSparseDiskUsage(t *testing.T) {
+func TestCloneDiskFileCopyPreservesSparseDiskUsage(t *testing.T) {
 	root := t.TempDir()
 	sourcePath := filepath.Join(root, "source.img")
 	targetPath := filepath.Join(root, "target.img")
@@ -46,7 +48,7 @@ func TestCloneFilePreservesSparseDiskUsage(t *testing.T) {
 		t.Skip("temp filesystem does not expose sparse allocation savings")
 	}
 
-	if err := cloneFile(sourcePath, targetPath); err != nil {
+	if err := cloneDiskFile(sourcePath, targetPath, appconfig.DiskCloneModeCopy); err != nil {
 		t.Fatalf("clone sparse file: %v", err)
 	}
 
@@ -81,6 +83,29 @@ func TestCloneFilePreservesSparseDiskUsage(t *testing.T) {
 	}
 }
 
+func TestCloneDiskFileReflinkMode(t *testing.T) {
+	root := t.TempDir()
+	sourcePath := filepath.Join(root, "source.img")
+	targetPath := filepath.Join(root, "target.img")
+
+	if err := os.WriteFile(sourcePath, []byte("rootfs"), 0o644); err != nil {
+		t.Fatalf("write source file: %v", err)
+	}
+
+	err := cloneDiskFile(sourcePath, targetPath, appconfig.DiskCloneModeReflink)
+	if err != nil {
+		t.Skipf("temp filesystem does not support reflinks: %v", err)
+	}
+
+	targetData, err := os.ReadFile(targetPath)
+	if err != nil {
+		t.Fatalf("read target file: %v", err)
+	}
+	if !bytes.Equal(targetData, []byte("rootfs")) {
+		t.Fatalf("target data mismatch: %q", string(targetData))
+	}
+}
+
 func allocatedBytes(path string) (int64, error) {
 	info, err := os.Stat(path)
 	if err != nil {
diff --git a/internal/daemon/snapshot.go b/internal/daemon/snapshot.go
index 7cd593e..f14ffd6 100644
--- a/internal/daemon/snapshot.go
+++ b/internal/daemon/snapshot.go
@@ -5,17 +5,16 @@ import (
 	"fmt"
 	"io"
 	"os"
-	"os/exec"
 	"path/filepath"
 	"sort"
 	"strconv"
 	"strings"
 	"time"
 
-	contracthost "github.com/getcompanion-ai/computer-host/contract"
 	"github.com/getcompanion-ai/computer-host/internal/firecracker"
 	"github.com/getcompanion-ai/computer-host/internal/model"
 	"github.com/getcompanion-ai/computer-host/internal/store"
+	contracthost "github.com/getcompanion-ai/computer-host/contract"
 )
 
 func (d *Daemon) CreateSnapshot(ctx context.Context, machineID contracthost.MachineID, req contracthost.CreateSnapshotRequest) (*contracthost.CreateSnapshotResponse, error) {
@@ -104,7 +103,7 @@ func (d *Daemon) CreateSnapshot(ctx context.Context, machineID contracthost.Mach
 		return nil, fmt.Errorf("get system volume: %w", err)
 	}
 	systemDiskTarget := filepath.Join(snapshotDir, "system.img")
-	if err := cowCopyFile(systemVolume.Path, systemDiskTarget); err != nil {
+	if err := cloneDiskFile(systemVolume.Path, systemDiskTarget, d.config.DiskCloneMode); err != nil {
 		_ = d.runtime.Resume(ctx, runtimeState)
 		_ = os.RemoveAll(snapshotDir)
 		return nil, fmt.Errorf("copy system disk: %w", err)
@@ -119,7 +118,7 @@ func (d *Daemon) CreateSnapshot(ctx context.Context, machineID contracthost.Mach
 		}
 		driveID := fmt.Sprintf("user-%d", i)
 		targetPath := filepath.Join(snapshotDir, driveID+".img")
-		if err := cowCopyFile(volume.Path, targetPath); err != nil {
+		if err := cloneDiskFile(volume.Path, targetPath, d.config.DiskCloneMode); err != nil {
 			_ = d.runtime.Resume(ctx, runtimeState)
 			_ = os.RemoveAll(snapshotDir)
 			return nil, fmt.Errorf("copy attached volume %q: %w", volumeID, err)
@@ -303,7 +302,7 @@ func (d *Daemon) RestoreSnapshot(ctx context.Context, snapshotID contracthost.Sn
 		clearOperation = true
 		return nil, fmt.Errorf("snapshot %q is missing vmstate artifact", snapshotID)
 	}
-	if err := cowCopyFile(systemDiskPath.LocalPath, newSystemDiskPath); err != nil {
+	if err := cloneDiskFile(systemDiskPath.LocalPath, newSystemDiskPath, d.config.DiskCloneMode); err != nil {
 		clearOperation = true
 		return nil, fmt.Errorf("copy system disk for restore: %w", err)
 	}
@@ -320,7 +319,7 @@ func (d *Daemon) RestoreSnapshot(ctx context.Context, snapshotID contracthost.Sn
 		driveID := strings.TrimSuffix(name, filepath.Ext(name))
 		volumeID := contracthost.VolumeID(fmt.Sprintf("%s-%s", req.MachineID, driveID))
 		volumePath := filepath.Join(d.config.MachineDisksDir, string(req.MachineID), name)
-		if err := cowCopyFile(restored.LocalPath, volumePath); err != nil {
+		if err := cloneDiskFile(restored.LocalPath, volumePath, d.config.DiskCloneMode); err != nil {
 			clearOperation = true
 			return nil, fmt.Errorf("copy restored drive %q: %w", driveID, err)
 		}
@@ -596,19 +595,3 @@ func moveFile(src, dst string) error {
 	}
 	return os.Remove(src)
 }
-
-func cowCopyFile(source string, target string) error {
-	if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
-		return fmt.Errorf("create target dir for %q: %w", target, err)
-	}
-	cmd := exec.Command("cp", "--reflink=auto", "--sparse=always", source, target)
-	output, err := cmd.CombinedOutput()
-	if err != nil {
-		if cloneErr := cloneFile(source, target); cloneErr == nil {
-			return nil
-		} else {
-			return fmt.Errorf("cow copy %q to %q: cp failed: %w: %s; clone fallback failed: %w", source, target, err, string(output), cloneErr)
-		}
-	}
-	return nil
-}