mirror of
https://github.com/getcompanion-ai/computer-host.git
synced 2026-04-15 03:00:42 +00:00
* feat: add Firecracker API client methods for VM pause/resume and snapshots
Add PatchVm, GetVm, PutSnapshotCreate, and PutSnapshotLoad methods to the
API client, along with supporting types (VmState, SnapshotCreateParams,
SnapshotLoadParams, MemBackend).
* feat: add snapshot data layer - contract types, model, store, config
Add SnapshotID and snapshot contract types, SnapshotRecord model,
store interface CRUD methods with file store implementation,
snapshot paths helper, SnapshotsDir config, and directory creation.
* feat: add runtime methods for VM pause, resume, snapshot, and restore
Implement Pause, Resume, CreateSnapshot, and RestoreBoot on the
firecracker Runtime. RestoreBoot launches a jailer, stages snapshot
files into the chroot, loads the snapshot, and resumes the VM.
* feat: add daemon snapshot create, restore, and reconciliation logic
Implement CreateSnapshot (pause, snapshot, COW-copy disk, resume),
RestoreSnapshot (COW-copy disk, RestoreBoot, wait for guest),
GetSnapshot, ListSnapshots, DeleteSnapshotByID, and crash recovery
reconciliation for snapshot and restore operations.
* feat: add HTTP endpoints for snapshot create, get, list, delete, restore
Wire 5 snapshot routes: POST /machines/{id}/snapshots (create),
GET /machines/{id}/snapshots (list), GET /snapshots/{id} (get),
DELETE /snapshots/{id} (delete), POST /snapshots/{id}/restore (restore).
* fix: cross-device rename, restore network, and snapshot cleanup
- Replace os.Rename with copy+remove for moving snapshot files out of
/proc/<pid>/root/ (cross-device link error on Linux)
- Reconfigure network interface after snapshot load so the restored VM
uses its own tap device instead of the source VM's
- Clean partial snapshot dirs immediately on failure instead of only
via reconcile
- Reject snapshot requests while a machine operation is already pending
* fix: test and modify snapshot runtime
* feat: snapshot lifecycle update, align runtime issues between host image
and daemon
111 lines
3.3 KiB
Go
111 lines
3.3 KiB
Go
package daemon
|
|
|
|
import (
	"context"
	"errors"
	"fmt"
	"os"
	"sync"
	"time"

	contracthost "github.com/getcompanion-ai/computer-host/contract"
	appconfig "github.com/getcompanion-ai/computer-host/internal/config"
	"github.com/getcompanion-ai/computer-host/internal/firecracker"
	"github.com/getcompanion-ai/computer-host/internal/store"
)
|
|
|
|
const (
	// defaultGuestKernelArgs is the kernel command line for guests: serial
	// console on ttyS0, reboot via keyboard controller, reboot 1s after a
	// panic, and PCI probing disabled (the usual Firecracker boot args).
	defaultGuestKernelArgs = "console=ttyS0 reboot=k panic=1 pci=off"
	// defaultGuestMemoryMiB is the guest memory size, in MiB, used when a
	// machine spec does not specify one — TODO confirm against Boot callers.
	defaultGuestMemoryMiB = int64(512)
	// defaultGuestVCPUs is the default number of virtual CPUs per guest.
	defaultGuestVCPUs = int64(1)
	// defaultSSHPort is the default port used to reach a guest over SSH.
	defaultSSHPort = uint16(2222)
	// defaultVNCPort is the default port used to reach a guest's VNC/noVNC
	// endpoint (6080 is the conventional noVNC websocket port).
	defaultVNCPort = uint16(6080)
	// defaultCopyBufferSize is the buffer size (1 MiB) used for file copies.
	defaultCopyBufferSize = 1024 * 1024
	// defaultGuestDialTimeout bounds a single connection attempt when
	// probing whether a guest is reachable.
	defaultGuestDialTimeout = 500 * time.Millisecond
	// defaultGuestReadyPollInterval is the delay between successive
	// guest-readiness probes.
	defaultGuestReadyPollInterval = 100 * time.Millisecond
	// defaultGuestReadyTimeout is the overall deadline for a guest to
	// become reachable after boot or restore.
	defaultGuestReadyTimeout = 30 * time.Second
)
|
|
|
|
// Runtime abstracts the Firecracker VM lifecycle operations the daemon
// needs, decoupling it from the concrete firecracker implementation
// (and allowing substitution, e.g. in tests).
type Runtime interface {
	// Boot starts a new VM from the given spec and network allocations,
	// returning the resulting machine state.
	Boot(context.Context, firecracker.MachineSpec, []firecracker.NetworkAllocation) (*firecracker.MachineState, error)
	// Inspect returns a refreshed view of an existing machine's state.
	Inspect(firecracker.MachineState) (*firecracker.MachineState, error)
	// Delete tears down the VM described by the given state.
	Delete(context.Context, firecracker.MachineState) error
	// Pause suspends a running VM.
	Pause(context.Context, firecracker.MachineState) error
	// Resume continues a paused VM.
	Resume(context.Context, firecracker.MachineState) error
	// CreateSnapshot writes a snapshot of the (paused) VM to the given paths.
	CreateSnapshot(context.Context, firecracker.MachineState, firecracker.SnapshotPaths) error
	// RestoreBoot boots a new VM from a previously created snapshot,
	// attaching the given network allocations.
	RestoreBoot(context.Context, firecracker.SnapshotLoadSpec, []firecracker.NetworkAllocation) (*firecracker.MachineState, error)
}
|
|
|
|
// Daemon coordinates machine lifecycle operations: it persists records in
// the store, drives VMs through the Runtime, and serializes concurrent
// operations with per-machine and per-artifact locks.
type Daemon struct {
	config  appconfig.Config
	store   store.Store
	runtime Runtime

	// reconfigureGuestIdentity updates a guest's identity after boot or
	// restore. Held as a field (rather than a plain method call) so it can
	// be overridden — presumably a test seam; New wires it to
	// reconfigureGuestIdentityOverSSH.
	reconfigureGuestIdentity func(context.Context, string, contracthost.MachineID) error

	// locksMu guards the two lazily-populated lock maps below; the inner
	// mutexes themselves are held across whole machine/artifact operations.
	locksMu       sync.Mutex
	machineLocks  map[contracthost.MachineID]*sync.Mutex
	artifactLocks map[string]*sync.Mutex
}
|
|
|
|
func New(cfg appconfig.Config, store store.Store, runtime Runtime) (*Daemon, error) {
|
|
if err := cfg.Validate(); err != nil {
|
|
return nil, err
|
|
}
|
|
if store == nil {
|
|
return nil, fmt.Errorf("store is required")
|
|
}
|
|
if runtime == nil {
|
|
return nil, fmt.Errorf("runtime is required")
|
|
}
|
|
for _, dir := range []string{cfg.ArtifactsDir, cfg.MachineDisksDir, cfg.SnapshotsDir, cfg.RuntimeDir} {
|
|
if err := os.MkdirAll(dir, 0o755); err != nil {
|
|
return nil, fmt.Errorf("create daemon dir %q: %w", dir, err)
|
|
}
|
|
}
|
|
daemon := &Daemon{
|
|
config: cfg,
|
|
store: store,
|
|
runtime: runtime,
|
|
reconfigureGuestIdentity: nil,
|
|
machineLocks: make(map[contracthost.MachineID]*sync.Mutex),
|
|
artifactLocks: make(map[string]*sync.Mutex),
|
|
}
|
|
daemon.reconfigureGuestIdentity = daemon.reconfigureGuestIdentityOverSSH
|
|
if err := daemon.ensureBackendSSHKeyPair(); err != nil {
|
|
return nil, err
|
|
}
|
|
return daemon, nil
|
|
}
|
|
|
|
func (d *Daemon) Health(ctx context.Context) (*contracthost.HealthResponse, error) {
|
|
if _, err := d.store.ListMachines(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
return &contracthost.HealthResponse{OK: true}, nil
|
|
}
|
|
|
|
func (d *Daemon) lockMachine(machineID contracthost.MachineID) func() {
|
|
d.locksMu.Lock()
|
|
lock, ok := d.machineLocks[machineID]
|
|
if !ok {
|
|
lock = &sync.Mutex{}
|
|
d.machineLocks[machineID] = lock
|
|
}
|
|
d.locksMu.Unlock()
|
|
|
|
lock.Lock()
|
|
return lock.Unlock
|
|
}
|
|
|
|
func (d *Daemon) lockArtifact(key string) func() {
|
|
d.locksMu.Lock()
|
|
lock, ok := d.artifactLocks[key]
|
|
if !ok {
|
|
lock = &sync.Mutex{}
|
|
d.artifactLocks[key] = lock
|
|
}
|
|
d.locksMu.Unlock()
|
|
|
|
lock.Lock()
|
|
return lock.Unlock
|
|
}
|