mirror of
https://github.com/getcompanion-ai/computer-host.git
synced 2026-04-15 03:00:42 +00:00
feat: local first snapshot implementation end to end
This commit is contained in:
parent
fc21e897ea
commit
30282928f5
4 changed files with 279 additions and 60 deletions
|
|
@ -81,10 +81,15 @@ type UploadSnapshotResponse struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
type RestoreSnapshotRequest struct {
|
type RestoreSnapshotRequest struct {
|
||||||
MachineID MachineID `json:"machine_id"`
|
MachineID MachineID `json:"machine_id"`
|
||||||
Artifact ArtifactRef `json:"artifact"`
|
Artifact ArtifactRef `json:"artifact"`
|
||||||
Snapshot DurableSnapshotSpec `json:"snapshot"`
|
LocalSnapshot *LocalSnapshotSpec `json:"local_snapshot,omitempty"`
|
||||||
GuestConfig *GuestConfig `json:"guest_config,omitempty"`
|
Snapshot *DurableSnapshotSpec `json:"snapshot,omitempty"`
|
||||||
|
GuestConfig *GuestConfig `json:"guest_config,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type LocalSnapshotSpec struct {
|
||||||
|
SnapshotID SnapshotID `json:"snapshot_id"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type DurableSnapshotSpec struct {
|
type DurableSnapshotSpec struct {
|
||||||
|
|
|
||||||
|
|
@ -422,7 +422,19 @@ func TestRestoreSnapshotFallsBackToLocalSnapshotNetwork(t *testing.T) {
|
||||||
stubGuestSSHPublicKeyReader(hostDaemon)
|
stubGuestSSHPublicKeyReader(hostDaemon)
|
||||||
hostDaemon.reconfigureGuestIdentity = func(context.Context, string, contracthost.MachineID, *contracthost.GuestConfig) error { return nil }
|
hostDaemon.reconfigureGuestIdentity = func(context.Context, string, contracthost.MachineID, *contracthost.GuestConfig) error { return nil }
|
||||||
|
|
||||||
artifactRef := contracthost.ArtifactRef{KernelImageURL: "kernel", RootFSURL: "rootfs"}
|
server := newRestoreArtifactServer(t, map[string][]byte{
|
||||||
|
"/kernel": []byte("kernel"),
|
||||||
|
"/rootfs": []byte("rootfs"),
|
||||||
|
"/memory": []byte("mem"),
|
||||||
|
"/vmstate": []byte("state"),
|
||||||
|
"/system": []byte("disk"),
|
||||||
|
})
|
||||||
|
defer server.Close()
|
||||||
|
|
||||||
|
artifactRef := contracthost.ArtifactRef{
|
||||||
|
KernelImageURL: server.URL + "/kernel",
|
||||||
|
RootFSURL: server.URL + "/rootfs",
|
||||||
|
}
|
||||||
kernelPath := filepath.Join(root, "artifact-kernel")
|
kernelPath := filepath.Join(root, "artifact-kernel")
|
||||||
if err := os.WriteFile(kernelPath, []byte("kernel"), 0o644); err != nil {
|
if err := os.WriteFile(kernelPath, []byte("kernel"), 0o644); err != nil {
|
||||||
t.Fatalf("write kernel: %v", err)
|
t.Fatalf("write kernel: %v", err)
|
||||||
|
|
@ -452,22 +464,13 @@ func TestRestoreSnapshotFallsBackToLocalSnapshotNetwork(t *testing.T) {
|
||||||
t.Fatalf("create snapshot: %v", err)
|
t.Fatalf("create snapshot: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
server := newRestoreArtifactServer(t, map[string][]byte{
|
|
||||||
"/kernel": []byte("kernel"),
|
|
||||||
"/rootfs": []byte("rootfs"),
|
|
||||||
"/memory": []byte("mem"),
|
|
||||||
"/vmstate": []byte("state"),
|
|
||||||
"/system": []byte("disk"),
|
|
||||||
})
|
|
||||||
defer server.Close()
|
|
||||||
|
|
||||||
response, err := hostDaemon.RestoreSnapshot(context.Background(), "snap1", contracthost.RestoreSnapshotRequest{
|
response, err := hostDaemon.RestoreSnapshot(context.Background(), "snap1", contracthost.RestoreSnapshotRequest{
|
||||||
MachineID: "restored",
|
MachineID: "restored",
|
||||||
Artifact: contracthost.ArtifactRef{
|
Artifact: contracthost.ArtifactRef{
|
||||||
KernelImageURL: server.URL + "/kernel",
|
KernelImageURL: server.URL + "/kernel",
|
||||||
RootFSURL: server.URL + "/rootfs",
|
RootFSURL: server.URL + "/rootfs",
|
||||||
},
|
},
|
||||||
Snapshot: contracthost.DurableSnapshotSpec{
|
Snapshot: &contracthost.DurableSnapshotSpec{
|
||||||
SnapshotID: "snap1",
|
SnapshotID: "snap1",
|
||||||
MachineID: "source",
|
MachineID: "source",
|
||||||
ImageID: "image-1",
|
ImageID: "image-1",
|
||||||
|
|
@ -510,6 +513,134 @@ func TestRestoreSnapshotFallsBackToLocalSnapshotNetwork(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRestoreSnapshotUsesLocalSnapshotArtifacts(t *testing.T) {
|
||||||
|
root := t.TempDir()
|
||||||
|
cfg := testConfig(root)
|
||||||
|
fileStore, err := store.NewFileStore(cfg.StatePath, cfg.OperationsPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("create file store: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
sshListener := listenTestPort(t, int(defaultSSHPort))
|
||||||
|
defer func() { _ = sshListener.Close() }()
|
||||||
|
vncListener := listenTestPort(t, int(defaultVNCPort))
|
||||||
|
defer func() { _ = vncListener.Close() }()
|
||||||
|
|
||||||
|
startedAt := time.Unix(1700000199, 0).UTC()
|
||||||
|
runtime := &fakeRuntime{
|
||||||
|
bootState: firecracker.MachineState{
|
||||||
|
ID: "restored-local",
|
||||||
|
Phase: firecracker.PhaseRunning,
|
||||||
|
PID: 1234,
|
||||||
|
RuntimeHost: "127.0.0.1",
|
||||||
|
SocketPath: filepath.Join(cfg.RuntimeDir, "machines", "restored-local", "root", "run", "firecracker.sock"),
|
||||||
|
TapName: "fctap0",
|
||||||
|
StartedAt: &startedAt,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
hostDaemon, err := New(cfg, fileStore, runtime)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("create daemon: %v", err)
|
||||||
|
}
|
||||||
|
stubGuestSSHPublicKeyReader(hostDaemon)
|
||||||
|
hostDaemon.reconfigureGuestIdentity = func(context.Context, string, contracthost.MachineID, *contracthost.GuestConfig) error { return nil }
|
||||||
|
|
||||||
|
server := newRestoreArtifactServer(t, map[string][]byte{
|
||||||
|
"/kernel": []byte("kernel"),
|
||||||
|
"/rootfs": []byte("rootfs"),
|
||||||
|
})
|
||||||
|
defer server.Close()
|
||||||
|
|
||||||
|
artifactRef := contracthost.ArtifactRef{
|
||||||
|
KernelImageURL: server.URL + "/kernel",
|
||||||
|
RootFSURL: server.URL + "/rootfs",
|
||||||
|
}
|
||||||
|
artifactDir := filepath.Join(root, "artifact")
|
||||||
|
if err := os.MkdirAll(artifactDir, 0o755); err != nil {
|
||||||
|
t.Fatalf("create artifact dir: %v", err)
|
||||||
|
}
|
||||||
|
kernelPath := filepath.Join(artifactDir, "vmlinux")
|
||||||
|
rootFSPath := filepath.Join(artifactDir, "rootfs.ext4")
|
||||||
|
if err := os.WriteFile(kernelPath, []byte("kernel"), 0o644); err != nil {
|
||||||
|
t.Fatalf("write kernel: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(rootFSPath, []byte("rootfs"), 0o644); err != nil {
|
||||||
|
t.Fatalf("write rootfs: %v", err)
|
||||||
|
}
|
||||||
|
if err := fileStore.PutArtifact(context.Background(), model.ArtifactRecord{
|
||||||
|
Ref: artifactRef,
|
||||||
|
LocalKey: "artifact",
|
||||||
|
LocalDir: artifactDir,
|
||||||
|
KernelImagePath: kernelPath,
|
||||||
|
RootFSPath: rootFSPath,
|
||||||
|
CreatedAt: time.Now().UTC(),
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("put artifact: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
snapshotDir := filepath.Join(root, "snapshots", "snap-local")
|
||||||
|
if err := os.MkdirAll(snapshotDir, 0o755); err != nil {
|
||||||
|
t.Fatalf("create snapshot dir: %v", err)
|
||||||
|
}
|
||||||
|
memoryPath := filepath.Join(snapshotDir, "memory.bin")
|
||||||
|
vmstatePath := filepath.Join(snapshotDir, "vmstate.bin")
|
||||||
|
systemPath := filepath.Join(snapshotDir, "system.img")
|
||||||
|
if err := os.WriteFile(memoryPath, []byte("mem"), 0o644); err != nil {
|
||||||
|
t.Fatalf("write memory: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(vmstatePath, []byte("state"), 0o644); err != nil {
|
||||||
|
t.Fatalf("write vmstate: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(systemPath, []byte("disk"), 0o644); err != nil {
|
||||||
|
t.Fatalf("write system disk: %v", err)
|
||||||
|
}
|
||||||
|
if err := fileStore.CreateSnapshot(context.Background(), model.SnapshotRecord{
|
||||||
|
ID: "snap-local",
|
||||||
|
MachineID: "source",
|
||||||
|
Artifact: artifactRef,
|
||||||
|
MemFilePath: memoryPath,
|
||||||
|
StateFilePath: vmstatePath,
|
||||||
|
DiskPaths: []string{systemPath},
|
||||||
|
Artifacts: []model.SnapshotArtifactRecord{
|
||||||
|
{ID: "memory", Kind: contracthost.SnapshotArtifactKindMemory, Name: "memory.bin", LocalPath: memoryPath, SizeBytes: 3},
|
||||||
|
{ID: "vmstate", Kind: contracthost.SnapshotArtifactKindVMState, Name: "vmstate.bin", LocalPath: vmstatePath, SizeBytes: 5},
|
||||||
|
{ID: "disk-system", Kind: contracthost.SnapshotArtifactKindDisk, Name: "system.img", LocalPath: systemPath, SizeBytes: 4},
|
||||||
|
},
|
||||||
|
SourceRuntimeHost: "172.16.0.2",
|
||||||
|
SourceTapDevice: "fctap0",
|
||||||
|
CreatedAt: time.Now().UTC(),
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("create snapshot: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
response, err := hostDaemon.RestoreSnapshot(context.Background(), "snap-local", contracthost.RestoreSnapshotRequest{
|
||||||
|
MachineID: "restored-local",
|
||||||
|
Artifact: artifactRef,
|
||||||
|
LocalSnapshot: &contracthost.LocalSnapshotSpec{
|
||||||
|
SnapshotID: "snap-local",
|
||||||
|
},
|
||||||
|
GuestConfig: &contracthost.GuestConfig{Hostname: "restored-local-shell"},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("restore snapshot: %v", err)
|
||||||
|
}
|
||||||
|
if response.Machine.ID != "restored-local" {
|
||||||
|
t.Fatalf("restored machine id mismatch: got %q", response.Machine.ID)
|
||||||
|
}
|
||||||
|
if runtime.restoreCalls != 1 {
|
||||||
|
t.Fatalf("restore boot call count mismatch: got %d want 1", runtime.restoreCalls)
|
||||||
|
}
|
||||||
|
if runtime.lastLoadSpec.Network == nil {
|
||||||
|
t.Fatalf("restore boot should preserve local snapshot network")
|
||||||
|
}
|
||||||
|
if got := runtime.lastLoadSpec.Network.GuestIP().String(); got != "172.16.0.2" {
|
||||||
|
t.Fatalf("restore guest ip mismatch: got %q want %q", got, "172.16.0.2")
|
||||||
|
}
|
||||||
|
if got := runtime.lastLoadSpec.Network.TapName; got != "fctap0" {
|
||||||
|
t.Fatalf("restore tap mismatch: got %q want %q", got, "fctap0")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRestoreSnapshotUsesDurableSnapshotSpec(t *testing.T) {
|
func TestRestoreSnapshotUsesDurableSnapshotSpec(t *testing.T) {
|
||||||
root := t.TempDir()
|
root := t.TempDir()
|
||||||
cfg := testConfig(root)
|
cfg := testConfig(root)
|
||||||
|
|
@ -565,7 +696,7 @@ func TestRestoreSnapshotUsesDurableSnapshotSpec(t *testing.T) {
|
||||||
KernelImageURL: server.URL + "/kernel",
|
KernelImageURL: server.URL + "/kernel",
|
||||||
RootFSURL: server.URL + "/rootfs",
|
RootFSURL: server.URL + "/rootfs",
|
||||||
},
|
},
|
||||||
Snapshot: contracthost.DurableSnapshotSpec{
|
Snapshot: &contracthost.DurableSnapshotSpec{
|
||||||
SnapshotID: "snap1",
|
SnapshotID: "snap1",
|
||||||
MachineID: "source",
|
MachineID: "source",
|
||||||
ImageID: "image-1",
|
ImageID: "image-1",
|
||||||
|
|
@ -666,7 +797,7 @@ func TestRestoreSnapshotRejectsWhenRestoreNetworkInUseOnHost(t *testing.T) {
|
||||||
KernelImageURL: "https://example.com/kernel",
|
KernelImageURL: "https://example.com/kernel",
|
||||||
RootFSURL: "https://example.com/rootfs",
|
RootFSURL: "https://example.com/rootfs",
|
||||||
},
|
},
|
||||||
Snapshot: contracthost.DurableSnapshotSpec{
|
Snapshot: &contracthost.DurableSnapshotSpec{
|
||||||
SnapshotID: "snap1",
|
SnapshotID: "snap1",
|
||||||
MachineID: "source",
|
MachineID: "source",
|
||||||
ImageID: "image-1",
|
ImageID: "image-1",
|
||||||
|
|
|
||||||
|
|
@ -14,10 +14,10 @@ import (
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
contracthost "github.com/getcompanion-ai/computer-host/contract"
|
|
||||||
"github.com/getcompanion-ai/computer-host/internal/firecracker"
|
"github.com/getcompanion-ai/computer-host/internal/firecracker"
|
||||||
"github.com/getcompanion-ai/computer-host/internal/model"
|
"github.com/getcompanion-ai/computer-host/internal/model"
|
||||||
hoststore "github.com/getcompanion-ai/computer-host/internal/store"
|
hoststore "github.com/getcompanion-ai/computer-host/internal/store"
|
||||||
|
contracthost "github.com/getcompanion-ai/computer-host/contract"
|
||||||
)
|
)
|
||||||
|
|
||||||
type blockingPublishedPortStore struct {
|
type blockingPublishedPortStore struct {
|
||||||
|
|
@ -509,7 +509,7 @@ func TestRestoreSnapshotTransitionsToStartingWithoutRelayAllocation(t *testing.T
|
||||||
KernelImageURL: server.URL + "/kernel",
|
KernelImageURL: server.URL + "/kernel",
|
||||||
RootFSURL: server.URL + "/rootfs",
|
RootFSURL: server.URL + "/rootfs",
|
||||||
},
|
},
|
||||||
Snapshot: contracthost.DurableSnapshotSpec{
|
Snapshot: &contracthost.DurableSnapshotSpec{
|
||||||
SnapshotID: "snap-exhausted",
|
SnapshotID: "snap-exhausted",
|
||||||
MachineID: "source",
|
MachineID: "source",
|
||||||
ImageID: "image-1",
|
ImageID: "image-1",
|
||||||
|
|
@ -708,7 +708,7 @@ func TestRestoreSnapshotCleansStagingArtifactsAfterSuccess(t *testing.T) {
|
||||||
KernelImageURL: server.URL + "/kernel",
|
KernelImageURL: server.URL + "/kernel",
|
||||||
RootFSURL: server.URL + "/rootfs",
|
RootFSURL: server.URL + "/rootfs",
|
||||||
},
|
},
|
||||||
Snapshot: contracthost.DurableSnapshotSpec{
|
Snapshot: &contracthost.DurableSnapshotSpec{
|
||||||
SnapshotID: "snap-clean",
|
SnapshotID: "snap-clean",
|
||||||
MachineID: "source",
|
MachineID: "source",
|
||||||
ImageID: "image-1",
|
ImageID: "image-1",
|
||||||
|
|
@ -759,7 +759,7 @@ func TestRestoreSnapshotCleansStagingArtifactsAfterDownloadFailure(t *testing.T)
|
||||||
KernelImageURL: server.URL + "/kernel",
|
KernelImageURL: server.URL + "/kernel",
|
||||||
RootFSURL: server.URL + "/rootfs",
|
RootFSURL: server.URL + "/rootfs",
|
||||||
},
|
},
|
||||||
Snapshot: contracthost.DurableSnapshotSpec{
|
Snapshot: &contracthost.DurableSnapshotSpec{
|
||||||
SnapshotID: "snap-fail-clean",
|
SnapshotID: "snap-fail-clean",
|
||||||
MachineID: "source",
|
MachineID: "source",
|
||||||
ImageID: "image-1",
|
ImageID: "image-1",
|
||||||
|
|
|
||||||
|
|
@ -11,12 +11,16 @@ import (
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"golang.org/x/sync/errgroup"
|
||||||
|
|
||||||
"github.com/getcompanion-ai/computer-host/internal/firecracker"
|
"github.com/getcompanion-ai/computer-host/internal/firecracker"
|
||||||
"github.com/getcompanion-ai/computer-host/internal/model"
|
"github.com/getcompanion-ai/computer-host/internal/model"
|
||||||
"github.com/getcompanion-ai/computer-host/internal/store"
|
"github.com/getcompanion-ai/computer-host/internal/store"
|
||||||
contracthost "github.com/getcompanion-ai/computer-host/contract"
|
contracthost "github.com/getcompanion-ai/computer-host/contract"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const localSnapshotRestoreUnavailablePrefix = "local snapshot restore unavailable"
|
||||||
|
|
||||||
func (d *Daemon) CreateSnapshot(ctx context.Context, machineID contracthost.MachineID, req contracthost.CreateSnapshotRequest) (*contracthost.CreateSnapshotResponse, error) {
|
func (d *Daemon) CreateSnapshot(ctx context.Context, machineID contracthost.MachineID, req contracthost.CreateSnapshotRequest) (*contracthost.CreateSnapshotResponse, error) {
|
||||||
unlock := d.lockMachine(machineID)
|
unlock := d.lockMachine(machineID)
|
||||||
defer unlock()
|
defer unlock()
|
||||||
|
|
@ -193,20 +197,31 @@ func (d *Daemon) UploadSnapshot(ctx context.Context, snapshotID contracthost.Sna
|
||||||
response := &contracthost.UploadSnapshotResponse{
|
response := &contracthost.UploadSnapshotResponse{
|
||||||
Artifacts: make([]contracthost.UploadedSnapshotArtifact, 0, len(req.Artifacts)),
|
Artifacts: make([]contracthost.UploadedSnapshotArtifact, 0, len(req.Artifacts)),
|
||||||
}
|
}
|
||||||
for _, upload := range req.Artifacts {
|
uploads := make([]contracthost.UploadedSnapshotArtifact, len(req.Artifacts))
|
||||||
artifact, ok := artifactIndex[upload.ArtifactID]
|
group, groupCtx := errgroup.WithContext(ctx)
|
||||||
if !ok {
|
for i, upload := range req.Artifacts {
|
||||||
return nil, fmt.Errorf("snapshot %q artifact %q not found", snapshotID, upload.ArtifactID)
|
i := i
|
||||||
}
|
upload := upload
|
||||||
completedParts, err := uploadSnapshotArtifact(ctx, artifact.LocalPath, upload.Parts)
|
group.Go(func() error {
|
||||||
if err != nil {
|
artifact, ok := artifactIndex[upload.ArtifactID]
|
||||||
return nil, fmt.Errorf("upload snapshot artifact %q: %w", upload.ArtifactID, err)
|
if !ok {
|
||||||
}
|
return fmt.Errorf("snapshot %q artifact %q not found", snapshotID, upload.ArtifactID)
|
||||||
response.Artifacts = append(response.Artifacts, contracthost.UploadedSnapshotArtifact{
|
}
|
||||||
ArtifactID: upload.ArtifactID,
|
completedParts, err := uploadSnapshotArtifact(groupCtx, artifact.LocalPath, upload.Parts)
|
||||||
CompletedParts: completedParts,
|
if err != nil {
|
||||||
|
return fmt.Errorf("upload snapshot artifact %q: %w", upload.ArtifactID, err)
|
||||||
|
}
|
||||||
|
uploads[i] = contracthost.UploadedSnapshotArtifact{
|
||||||
|
ArtifactID: upload.ArtifactID,
|
||||||
|
CompletedParts: completedParts,
|
||||||
|
}
|
||||||
|
return nil
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
if err := group.Wait(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
response.Artifacts = append(response.Artifacts, uploads...)
|
||||||
|
|
||||||
return response, nil
|
return response, nil
|
||||||
}
|
}
|
||||||
|
|
@ -215,12 +230,18 @@ func (d *Daemon) RestoreSnapshot(ctx context.Context, snapshotID contracthost.Sn
|
||||||
if err := validateMachineID(req.MachineID); err != nil {
|
if err := validateMachineID(req.MachineID); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
if req.Snapshot.SnapshotID != "" && req.Snapshot.SnapshotID != snapshotID {
|
|
||||||
return nil, fmt.Errorf("snapshot id mismatch: path=%q payload=%q", snapshotID, req.Snapshot.SnapshotID)
|
|
||||||
}
|
|
||||||
if err := validateArtifactRef(req.Artifact); err != nil {
|
if err := validateArtifactRef(req.Artifact); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
if req.LocalSnapshot == nil && req.Snapshot == nil {
|
||||||
|
return nil, fmt.Errorf("restore request must include local_snapshot or snapshot")
|
||||||
|
}
|
||||||
|
if req.LocalSnapshot != nil && req.LocalSnapshot.SnapshotID != "" && req.LocalSnapshot.SnapshotID != snapshotID {
|
||||||
|
return nil, fmt.Errorf("local snapshot id mismatch: path=%q payload=%q", snapshotID, req.LocalSnapshot.SnapshotID)
|
||||||
|
}
|
||||||
|
if req.Snapshot != nil && req.Snapshot.SnapshotID != "" && req.Snapshot.SnapshotID != snapshotID {
|
||||||
|
return nil, fmt.Errorf("snapshot id mismatch: path=%q payload=%q", snapshotID, req.Snapshot.SnapshotID)
|
||||||
|
}
|
||||||
if err := validateGuestConfig(req.GuestConfig); err != nil {
|
if err := validateGuestConfig(req.GuestConfig); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
@ -258,30 +279,18 @@ func (d *Daemon) RestoreSnapshot(ctx context.Context, snapshotID contracthost.Sn
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
restoreNetwork, err := d.resolveRestoreNetwork(ctx, snapshotID, req.Snapshot)
|
restoredArtifacts, restoreNetwork, cleanupRestoreArtifacts, err := d.prepareRestoreArtifacts(ctx, snapshotID, req, usedNetworks)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
clearOperation = true
|
clearOperation = true
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
if networkAllocationInUse(restoreNetwork, usedNetworks) {
|
defer cleanupRestoreArtifacts()
|
||||||
clearOperation = true
|
|
||||||
return nil, fmt.Errorf("restore network for snapshot %q is still in use on this host (runtime_host=%s tap_device=%s)", snapshotID, restoreNetwork.GuestIP(), restoreNetwork.TapName)
|
|
||||||
}
|
|
||||||
|
|
||||||
artifact, err := d.ensureArtifact(ctx, req.Artifact)
|
artifact, err := d.ensureArtifact(ctx, req.Artifact)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
clearOperation = true
|
||||||
return nil, fmt.Errorf("ensure artifact for restore: %w", err)
|
return nil, fmt.Errorf("ensure artifact for restore: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
stagingDir := filepath.Join(d.config.SnapshotsDir, string(snapshotID), "restores", string(req.MachineID))
|
|
||||||
restoredArtifacts, err := downloadDurableSnapshotArtifacts(ctx, stagingDir, req.Snapshot.Artifacts)
|
|
||||||
if err != nil {
|
|
||||||
_ = os.RemoveAll(stagingDir)
|
|
||||||
clearOperation = true
|
|
||||||
return nil, fmt.Errorf("download durable snapshot artifacts: %w", err)
|
|
||||||
}
|
|
||||||
defer func() { _ = os.RemoveAll(stagingDir) }()
|
|
||||||
|
|
||||||
// COW-copy system disk from snapshot to new machine's disk dir.
|
// COW-copy system disk from snapshot to new machine's disk dir.
|
||||||
newSystemDiskPath := d.systemVolumePath(req.MachineID)
|
newSystemDiskPath := d.systemVolumePath(req.MachineID)
|
||||||
if err := os.MkdirAll(filepath.Dir(newSystemDiskPath), 0o755); err != nil {
|
if err := os.MkdirAll(filepath.Dir(newSystemDiskPath), 0o755); err != nil {
|
||||||
|
|
@ -515,19 +524,59 @@ func restoredUserDiskIndex(name string) (int, bool) {
|
||||||
return index, true
|
return index, true
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Daemon) resolveRestoreNetwork(ctx context.Context, snapshotID contracthost.SnapshotID, spec contracthost.DurableSnapshotSpec) (firecracker.NetworkAllocation, error) {
|
func (d *Daemon) prepareRestoreArtifacts(ctx context.Context, snapshotID contracthost.SnapshotID, req contracthost.RestoreSnapshotRequest, usedNetworks []firecracker.NetworkAllocation) (map[string]restoredSnapshotArtifact, firecracker.NetworkAllocation, func(), error) {
|
||||||
if network, err := restoreNetworkFromDurableSpec(spec); err == nil {
|
if req.LocalSnapshot != nil {
|
||||||
return network, nil
|
if req.LocalSnapshot.SnapshotID != "" && req.LocalSnapshot.SnapshotID != snapshotID {
|
||||||
|
return nil, firecracker.NetworkAllocation{}, func() {}, fmt.Errorf("local snapshot id mismatch: path=%q payload=%q", snapshotID, req.LocalSnapshot.SnapshotID)
|
||||||
|
}
|
||||||
|
snapshot, err := d.store.GetSnapshot(ctx, snapshotID)
|
||||||
|
if err != nil {
|
||||||
|
if err == store.ErrNotFound {
|
||||||
|
return nil, firecracker.NetworkAllocation{}, func() {}, localSnapshotRestoreUnavailable(snapshotID, "snapshot is not present on this host")
|
||||||
|
}
|
||||||
|
return nil, firecracker.NetworkAllocation{}, func() {}, err
|
||||||
|
}
|
||||||
|
restoreNetwork, err := restoreNetworkFromSnapshot(snapshot)
|
||||||
|
if err != nil {
|
||||||
|
return nil, firecracker.NetworkAllocation{}, func() {}, localSnapshotRestoreUnavailable(snapshotID, err.Error())
|
||||||
|
}
|
||||||
|
if networkAllocationInUse(restoreNetwork, usedNetworks) {
|
||||||
|
return nil, firecracker.NetworkAllocation{}, func() {}, localSnapshotRestoreUnavailable(snapshotID, fmt.Sprintf("restore network is still in use on this host (runtime_host=%s tap_device=%s)", restoreNetwork.GuestIP(), restoreNetwork.TapName))
|
||||||
|
}
|
||||||
|
artifacts, err := localSnapshotArtifacts(snapshot)
|
||||||
|
if err != nil {
|
||||||
|
return nil, firecracker.NetworkAllocation{}, func() {}, localSnapshotRestoreUnavailable(snapshotID, err.Error())
|
||||||
|
}
|
||||||
|
return artifacts, restoreNetwork, func() {}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
snapshot, err := d.store.GetSnapshot(ctx, snapshotID)
|
if req.Snapshot == nil {
|
||||||
if err == nil {
|
return nil, firecracker.NetworkAllocation{}, func() {}, fmt.Errorf("durable snapshot spec is required")
|
||||||
return restoreNetworkFromSnapshot(snapshot)
|
|
||||||
}
|
}
|
||||||
if err != store.ErrNotFound {
|
restoreNetwork, err := restoreNetworkFromDurableSpec(*req.Snapshot)
|
||||||
return firecracker.NetworkAllocation{}, err
|
if err != nil {
|
||||||
|
snapshot, lookupErr := d.store.GetSnapshot(ctx, snapshotID)
|
||||||
|
if lookupErr == nil {
|
||||||
|
restoreNetwork, err = restoreNetworkFromSnapshot(snapshot)
|
||||||
|
} else if lookupErr != store.ErrNotFound {
|
||||||
|
return nil, firecracker.NetworkAllocation{}, func() {}, lookupErr
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return nil, firecracker.NetworkAllocation{}, func() {}, err
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return firecracker.NetworkAllocation{}, fmt.Errorf("snapshot %q is missing restore network metadata", snapshotID)
|
if networkAllocationInUse(restoreNetwork, usedNetworks) {
|
||||||
|
return nil, firecracker.NetworkAllocation{}, func() {}, fmt.Errorf("restore network for snapshot %q is still in use on this host (runtime_host=%s tap_device=%s)", snapshotID, restoreNetwork.GuestIP(), restoreNetwork.TapName)
|
||||||
|
}
|
||||||
|
stagingDir := filepath.Join(d.config.SnapshotsDir, string(snapshotID), "restores", string(req.MachineID))
|
||||||
|
artifacts, err := downloadDurableSnapshotArtifacts(ctx, stagingDir, req.Snapshot.Artifacts)
|
||||||
|
if err != nil {
|
||||||
|
_ = os.RemoveAll(stagingDir)
|
||||||
|
return nil, firecracker.NetworkAllocation{}, func() {}, fmt.Errorf("download durable snapshot artifacts: %w", err)
|
||||||
|
}
|
||||||
|
return artifacts, restoreNetwork, func() {
|
||||||
|
_ = os.RemoveAll(stagingDir)
|
||||||
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func restoreNetworkFromDurableSpec(spec contracthost.DurableSnapshotSpec) (firecracker.NetworkAllocation, error) {
|
func restoreNetworkFromDurableSpec(spec contracthost.DurableSnapshotSpec) (firecracker.NetworkAllocation, error) {
|
||||||
|
|
@ -555,6 +604,32 @@ func restoreNetworkFromSnapshot(snap *model.SnapshotRecord) (firecracker.Network
|
||||||
return network, nil
|
return network, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func localSnapshotArtifacts(snapshot *model.SnapshotRecord) (map[string]restoredSnapshotArtifact, error) {
|
||||||
|
if snapshot == nil {
|
||||||
|
return nil, fmt.Errorf("snapshot is required")
|
||||||
|
}
|
||||||
|
restored := make(map[string]restoredSnapshotArtifact, len(snapshot.Artifacts))
|
||||||
|
for _, artifact := range snapshot.Artifacts {
|
||||||
|
if strings.TrimSpace(artifact.LocalPath) == "" {
|
||||||
|
return nil, fmt.Errorf("snapshot %q artifact %q is missing a local path", snapshot.ID, artifact.ID)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(artifact.LocalPath); err != nil {
|
||||||
|
return nil, fmt.Errorf("snapshot %q artifact %q is unavailable at %q: %w", snapshot.ID, artifact.ID, artifact.LocalPath, err)
|
||||||
|
}
|
||||||
|
restored[artifact.Name] = restoredSnapshotArtifact{
|
||||||
|
Artifact: contracthost.SnapshotArtifact{
|
||||||
|
ID: artifact.ID,
|
||||||
|
Kind: artifact.Kind,
|
||||||
|
Name: artifact.Name,
|
||||||
|
SizeBytes: artifact.SizeBytes,
|
||||||
|
SHA256Hex: artifact.SHA256Hex,
|
||||||
|
},
|
||||||
|
LocalPath: artifact.LocalPath,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return restored, nil
|
||||||
|
}
|
||||||
|
|
||||||
func networkAllocationInUse(target firecracker.NetworkAllocation, used []firecracker.NetworkAllocation) bool {
|
func networkAllocationInUse(target firecracker.NetworkAllocation, used []firecracker.NetworkAllocation) bool {
|
||||||
targetTap := strings.TrimSpace(target.TapName)
|
targetTap := strings.TrimSpace(target.TapName)
|
||||||
for _, network := range used {
|
for _, network := range used {
|
||||||
|
|
@ -568,6 +643,14 @@ func networkAllocationInUse(target firecracker.NetworkAllocation, used []firecra
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func localSnapshotRestoreUnavailable(snapshotID contracthost.SnapshotID, message string) error {
|
||||||
|
message = strings.TrimSpace(message)
|
||||||
|
if message == "" {
|
||||||
|
message = "local restore is unavailable"
|
||||||
|
}
|
||||||
|
return fmt.Errorf("%s: snapshot %q %s", localSnapshotRestoreUnavailablePrefix, snapshotID, message)
|
||||||
|
}
|
||||||
|
|
||||||
// moveFile copies src to dst then removes src. Works across filesystem boundaries
|
// moveFile copies src to dst then removes src. Works across filesystem boundaries
|
||||||
// unlike os.Rename, which is needed when moving files out of /proc/<pid>/root/.
|
// unlike os.Rename, which is needed when moving files out of /proc/<pid>/root/.
|
||||||
func moveFile(src, dst string) error {
|
func moveFile(src, dst string) error {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue