mirror of
https://github.com/getcompanion-ai/computer-host.git
synced 2026-04-15 10:05:13 +00:00
host api alignment (#7)
* feat: add Firecracker API client methods for VM pause/resume and snapshots
Add PatchVm, GetVm, PutSnapshotCreate, and PutSnapshotLoad methods to the
API client, along with supporting types (VmState, SnapshotCreateParams,
SnapshotLoadParams, MemBackend).
* feat: add snapshot data layer - contract types, model, store, config
Add SnapshotID and snapshot contract types, SnapshotRecord model,
store interface CRUD methods with file store implementation,
snapshot paths helper, SnapshotsDir config, and directory creation.
* feat: add runtime methods for VM pause, resume, snapshot, and restore
Implement Pause, Resume, CreateSnapshot, and RestoreBoot on the
firecracker Runtime. RestoreBoot launches a jailer, stages snapshot
files into the chroot, loads the snapshot, and resumes the VM.
* feat: add daemon snapshot create, restore, and reconciliation logic
Implement CreateSnapshot (pause, snapshot, COW-copy disk, resume),
RestoreSnapshot (COW-copy disk, RestoreBoot, wait for guest),
GetSnapshot, ListSnapshots, DeleteSnapshotByID, and crash recovery
reconciliation for snapshot and restore operations.
* feat: add HTTP endpoints for snapshot create, get, list, delete, restore
Wire 5 snapshot routes: POST /machines/{id}/snapshots (create),
GET /machines/{id}/snapshots (list), GET /snapshots/{id} (get),
DELETE /snapshots/{id} (delete), POST /snapshots/{id}/restore (restore).
* fix: cross-device rename, restore network, and snapshot cleanup
- Replace os.Rename with copy+remove for moving snapshot files out of
/proc/<pid>/root/ (cross-device link error on Linux)
- Reconfigure network interface after snapshot load so the restored VM
uses its own tap device instead of the source VM's
- Clean partial snapshot dirs immediately on failure instead of only
via reconcile
- Reject snapshot requests while a machine operation is already pending
* fix: test and modify snapshot runtime
* feat: snapshot lifecycle update, align runtime issues between host image
and daemon
This commit is contained in:
parent
9382de7eba
commit
b5c97aef07
17 changed files with 1287 additions and 20 deletions
|
|
@ -220,6 +220,155 @@ func (r *Runtime) Delete(ctx context.Context, state MachineState) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (r *Runtime) Pause(ctx context.Context, state MachineState) error {
|
||||
client := newAPIClient(state.SocketPath)
|
||||
return client.PatchVm(ctx, VmStatePaused)
|
||||
}
|
||||
|
||||
func (r *Runtime) Resume(ctx context.Context, state MachineState) error {
|
||||
client := newAPIClient(state.SocketPath)
|
||||
return client.PatchVm(ctx, VmStateResumed)
|
||||
}
|
||||
|
||||
func (r *Runtime) CreateSnapshot(ctx context.Context, state MachineState, paths SnapshotPaths) error {
|
||||
client := newAPIClient(state.SocketPath)
|
||||
return client.PutSnapshotCreate(ctx, SnapshotCreateParams{
|
||||
MemFilePath: paths.MemFilePath,
|
||||
SnapshotPath: paths.StateFilePath,
|
||||
SnapshotType: "Full",
|
||||
})
|
||||
}
|
||||
|
||||
// RestoreBoot starts a new firecracker process from an existing snapshot.
// It launches a jailer, stages the snapshot (memory + vmstate) and disk
// files into the jail chroot, loads the snapshot with a network override
// pointing at this machine's own tap device, and finally resumes the VM.
// On any failure, everything created so far (process, network allocation,
// machine directory) is torn down before returning the error.
//
// loadSpec carries the machine ID, snapshot/disk file paths, and an
// optional pre-reserved network; usedNetworks lists allocations that a
// fresh allocation must not collide with.
func (r *Runtime) RestoreBoot(ctx context.Context, loadSpec SnapshotLoadSpec, usedNetworks []NetworkAllocation) (*MachineState, error) {
	// cleanup tears down whatever has been created up to the failure
	// point. Zero values (empty paths, nil command, pid 0) mark
	// resources that were never created; the helpers tolerate them.
	cleanup := func(network NetworkAllocation, paths machinePaths, command *exec.Cmd, firecrackerPID int) {
		// Debug escape hatch: keep everything on disk for inspection.
		if preserveFailureArtifacts() {
			return
		}
		cleanupRunningProcess(firecrackerPID)
		cleanupStartedProcess(command)
		// Background context: teardown must proceed even if the
		// caller's ctx is already canceled.
		_ = r.networkProvisioner.Remove(context.Background(), network)
		if paths.BaseDir != "" {
			_ = os.RemoveAll(paths.BaseDir)
		}
	}

	// Reuse a caller-supplied network reservation when present;
	// otherwise allocate one that avoids the currently used set.
	var network NetworkAllocation
	if loadSpec.Network != nil {
		network = *loadSpec.Network
	} else {
		var err error
		network, err = r.networkAllocator.Allocate(usedNetworks)
		if err != nil {
			return nil, err
		}
	}

	paths, err := buildMachinePaths(r.rootDir, loadSpec.ID, r.firecrackerBinaryPath)
	if err != nil {
		// No machine dirs exist yet, so pass an empty machinePaths.
		cleanup(network, machinePaths{}, nil, 0)
		return nil, err
	}
	if err := os.MkdirAll(paths.LogDir, 0o755); err != nil {
		cleanup(network, paths, nil, 0)
		return nil, fmt.Errorf("create machine log dir %q: %w", paths.LogDir, err)
	}
	if err := r.networkProvisioner.Ensure(ctx, network); err != nil {
		cleanup(network, paths, nil, 0)
		return nil, err
	}

	command, err := launchJailedFirecracker(paths, loadSpec.ID, r.firecrackerBinaryPath, r.jailerBinaryPath)
	if err != nil {
		cleanup(network, paths, nil, 0)
		return nil, err
	}
	firecrackerPID, err := waitForPIDFile(ctx, paths.PIDFilePath)
	if err != nil {
		cleanup(network, paths, command, 0)
		return nil, fmt.Errorf("wait for firecracker pid: %w", err)
	}

	// NOTE(review): procSocketPath presumably resolves the API socket
	// through /proc/<pid>/root/ so the host can reach it inside the
	// jailer chroot — confirm against the helper's implementation.
	socketPath := procSocketPath(firecrackerPID)
	client := newAPIClient(socketPath)
	if err := waitForSocket(ctx, client, socketPath); err != nil {
		cleanup(network, paths, command, firecrackerPID)
		return nil, fmt.Errorf("wait for firecracker socket: %w", err)
	}

	// Stage snapshot files and disk images into the chroot
	chrootMemPath, err := stageSnapshotFile(loadSpec.MemFilePath, paths.ChrootRootDir, "memory.bin")
	if err != nil {
		cleanup(network, paths, command, firecrackerPID)
		return nil, fmt.Errorf("stage memory file: %w", err)
	}
	chrootStatePath, err := stageSnapshotFile(loadSpec.SnapshotPath, paths.ChrootRootDir, "vmstate.bin")
	if err != nil {
		cleanup(network, paths, command, firecrackerPID)
		return nil, fmt.Errorf("stage vmstate file: %w", err)
	}

	// Stage root filesystem
	rootFSName, err := stagedFileName(loadSpec.RootFSPath)
	if err != nil {
		cleanup(network, paths, command, firecrackerPID)
		return nil, fmt.Errorf("rootfs path: %w", err)
	}
	if err := linkMachineFile(loadSpec.RootFSPath, filepath.Join(paths.ChrootRootDir, rootFSName)); err != nil {
		cleanup(network, paths, command, firecrackerPID)
		return nil, fmt.Errorf("link rootfs into jail: %w", err)
	}

	// Stage additional drives
	for driveID, drivePath := range loadSpec.DiskPaths {
		driveName, err := stagedFileName(drivePath)
		if err != nil {
			cleanup(network, paths, command, firecrackerPID)
			return nil, fmt.Errorf("drive %q path: %w", driveID, err)
		}
		if err := linkMachineFile(drivePath, filepath.Join(paths.ChrootRootDir, driveName)); err != nil {
			cleanup(network, paths, command, firecrackerPID)
			return nil, fmt.Errorf("link drive %q into jail: %w", driveID, err)
		}
	}

	// Load snapshot (replaces the full configure+start sequence)
	if err := client.PutSnapshotLoad(ctx, SnapshotLoadParams{
		SnapshotPath: chrootStatePath,
		MemBackend: &MemBackend{
			BackendType: "File",
			BackendPath: chrootMemPath,
		},
		// Resume separately below rather than as part of the load.
		ResumeVm: false,
		// Rebind the guest NIC to this machine's own tap device so the
		// restored VM does not keep using the source VM's interface.
		NetworkOverrides: []NetworkOverride{
			{
				IfaceID:     network.InterfaceID,
				HostDevName: network.TapName,
			},
		},
	}); err != nil {
		cleanup(network, paths, command, firecrackerPID)
		return nil, fmt.Errorf("load snapshot: %w", err)
	}

	// Resume the restored VM
	if err := client.PatchVm(ctx, VmStateResumed); err != nil {
		cleanup(network, paths, command, firecrackerPID)
		return nil, fmt.Errorf("resume restored vm: %w", err)
	}

	now := time.Now().UTC()
	state := MachineState{
		ID:          loadSpec.ID,
		Phase:       PhaseRunning,
		PID:         firecrackerPID,
		RuntimeHost: network.GuestIP().String(),
		SocketPath:  socketPath,
		TapName:     network.TapName,
		StartedAt:   &now,
	}
	return &state, nil
}
|
||||
|
||||
func processExists(pid int) bool {
|
||||
if pid < 1 {
|
||||
return false
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue