computer-host/internal/firecracker/runtime.go

301 lines
7 KiB
Go

package firecracker
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"sync"
"time"
sdk "github.com/firecracker-microvm/firecracker-go-sdk"
)
// ErrMachineNotFound is returned when the runtime does not know a machine ID.
var ErrMachineNotFound = errors.New("machine not found")
// RuntimeConfig configures the host-local Firecracker runtime wrapper.
type RuntimeConfig struct {
RootDir string
FirecrackerBinaryPath string
JailerBinaryPath string
JailerUID int
JailerGID int
NumaNode int
NetworkCIDR string
NetworkAllocator *NetworkAllocator
NetworkProvisioner NetworkProvisioner
}
// Runtime manages local Firecracker machines on a single host.
type Runtime struct {
config RuntimeConfig
mu sync.RWMutex
machines map[MachineID]*managedMachine
newMachine func(context.Context, sdk.Config) (*sdk.Machine, error)
}
type managedMachine struct {
spec MachineSpec
network NetworkAllocation
paths machinePaths
machine *sdk.Machine
state MachineState
}
// NewRuntime creates a new host-local Firecracker runtime wrapper.
func NewRuntime(cfg RuntimeConfig) (*Runtime, error) {
cfg.RootDir = filepath.Clean(cfg.RootDir)
if cfg.RootDir == "." || cfg.RootDir == "" {
return nil, fmt.Errorf("runtime root dir is required")
}
if err := os.MkdirAll(cfg.RootDir, 0o755); err != nil {
return nil, fmt.Errorf("create runtime root dir %q: %w", cfg.RootDir, err)
}
if cfg.FirecrackerBinaryPath == "" {
cfg.FirecrackerBinaryPath = "firecracker"
}
if cfg.JailerBinaryPath == "" {
cfg.JailerBinaryPath = "jailer"
}
if cfg.JailerUID == 0 {
cfg.JailerUID = os.Getuid()
}
if cfg.JailerGID == 0 {
cfg.JailerGID = os.Getgid()
}
if cfg.NumaNode < 0 {
cfg.NumaNode = 0
}
if cfg.NetworkAllocator == nil {
allocator, err := NewNetworkAllocator(cfg.NetworkCIDR)
if err != nil {
return nil, err
}
cfg.NetworkAllocator = allocator
}
if cfg.NetworkProvisioner == nil {
cfg.NetworkProvisioner = NewIPTapProvisioner()
}
return &Runtime{
config: cfg,
machines: make(map[MachineID]*managedMachine),
newMachine: func(ctx context.Context, cfg sdk.Config) (*sdk.Machine, error) {
return sdk.NewMachine(ctx, cfg)
},
}, nil
}
// Boot provisions host resources and starts a new jailed Firecracker process.
func (r *Runtime) Boot(ctx context.Context, spec MachineSpec) (*MachineState, error) {
if err := spec.Validate(); err != nil {
return nil, err
}
r.mu.Lock()
if _, exists := r.machines[spec.ID]; exists {
r.mu.Unlock()
return nil, fmt.Errorf("machine %q already exists", spec.ID)
}
usedNetworks := make([]NetworkAllocation, 0, len(r.machines))
for _, machine := range r.machines {
if machine != nil {
usedNetworks = append(usedNetworks, machine.network)
}
}
r.machines[spec.ID] = &managedMachine{
spec: spec,
state: MachineState{
ID: spec.ID,
Phase: PhaseProvisioning,
},
}
r.mu.Unlock()
cleanup := func(network NetworkAllocation, paths machinePaths) {
_ = r.config.NetworkProvisioner.Remove(context.Background(), network)
if paths.BaseDir != "" {
_ = os.RemoveAll(paths.BaseDir)
}
r.mu.Lock()
delete(r.machines, spec.ID)
r.mu.Unlock()
}
network, err := r.config.NetworkAllocator.Allocate(usedNetworks)
if err != nil {
cleanup(NetworkAllocation{}, machinePaths{})
return nil, err
}
paths, err := buildMachinePaths(r.config.RootDir, spec.ID, r.config.FirecrackerBinaryPath)
if err != nil {
cleanup(network, machinePaths{})
return nil, err
}
if err := os.MkdirAll(paths.JailerBaseDir, 0o755); err != nil {
cleanup(network, paths)
return nil, fmt.Errorf("create machine jailer dir %q: %w", paths.JailerBaseDir, err)
}
if err := r.config.NetworkProvisioner.Ensure(ctx, network); err != nil {
cleanup(network, paths)
return nil, err
}
cfg, err := buildSDKConfig(spec, paths, network, r.config)
if err != nil {
cleanup(network, paths)
return nil, err
}
machine, err := r.newMachine(ctx, cfg)
if err != nil {
cleanup(network, paths)
return nil, fmt.Errorf("create firecracker machine: %w", err)
}
if err := machine.Start(ctx); err != nil {
cleanup(network, paths)
return nil, fmt.Errorf("start firecracker machine: %w", err)
}
pid, _ := machine.PID()
now := time.Now().UTC()
state := MachineState{
ID: spec.ID,
Phase: PhaseRunning,
PID: pid,
RuntimeHost: network.GuestIP().String(),
SocketPath: machine.Cfg.SocketPath,
TapName: network.TapName,
StartedAt: &now,
}
r.mu.Lock()
entry := r.machines[spec.ID]
entry.network = network
entry.paths = paths
entry.machine = machine
entry.state = state
r.mu.Unlock()
go r.watchMachine(spec.ID, machine)
out := state
return &out, nil
}
// Inspect returns the currently known state for a machine.
func (r *Runtime) Inspect(id MachineID) (*MachineState, error) {
r.mu.RLock()
entry, ok := r.machines[id]
r.mu.RUnlock()
if !ok || entry == nil {
return nil, ErrMachineNotFound
}
state := entry.state
if entry.machine != nil {
pid, err := entry.machine.PID()
if err != nil {
if state.Phase == PhaseRunning {
state.Phase = PhaseStopped
state.PID = 0
}
} else {
state.PID = pid
}
}
return &state, nil
}
// Stop terminates a running Firecracker process and updates local state.
func (r *Runtime) Stop(ctx context.Context, id MachineID) error {
r.mu.RLock()
entry, ok := r.machines[id]
r.mu.RUnlock()
if !ok || entry == nil {
return ErrMachineNotFound
}
if entry.machine == nil {
return fmt.Errorf("machine %q has no firecracker process", id)
}
errCh := make(chan error, 1)
go func() {
errCh <- entry.machine.StopVMM()
}()
select {
case err := <-errCh:
if err != nil {
return fmt.Errorf("stop machine %q: %w", id, err)
}
case <-ctx.Done():
return ctx.Err()
}
r.mu.Lock()
entry.state.Phase = PhaseStopped
entry.state.PID = 0
entry.state.Error = ""
r.mu.Unlock()
return nil
}
// Delete stops a machine if necessary and removes its local resources.
func (r *Runtime) Delete(ctx context.Context, id MachineID) error {
r.mu.RLock()
entry, ok := r.machines[id]
r.mu.RUnlock()
if !ok || entry == nil {
return ErrMachineNotFound
}
if entry.machine != nil {
if err := r.Stop(ctx, id); err != nil && !errors.Is(err, context.Canceled) {
return err
}
}
if err := r.config.NetworkProvisioner.Remove(ctx, entry.network); err != nil {
return err
}
if err := os.RemoveAll(entry.paths.BaseDir); err != nil {
return fmt.Errorf("remove machine dir %q: %w", entry.paths.BaseDir, err)
}
r.mu.Lock()
delete(r.machines, id)
r.mu.Unlock()
return nil
}
func (r *Runtime) watchMachine(id MachineID, machine *sdk.Machine) {
err := machine.Wait(context.Background())
r.mu.Lock()
defer r.mu.Unlock()
entry, ok := r.machines[id]
if !ok || entry == nil || entry.machine != machine {
return
}
entry.state.PID = 0
if err != nil {
entry.state.Phase = PhaseError
entry.state.Error = err.Error()
return
}
if entry.state.Phase != PhaseStopped {
entry.state.Phase = PhaseStopped
}
entry.state.Error = ""
}