mirror of
https://github.com/getcompanion-ai/computer-host.git
synced 2026-04-15 14:03:47 +00:00
host daemon (#2)
* feat: host daemon api scaffold * fix: use sparse writes * fix: unix socket length (<108 bytes)
This commit is contained in:
parent
4028bb5a1d
commit
e2f9e54970
21 changed files with 2111 additions and 372 deletions
|
|
@ -8,7 +8,6 @@ import (
|
|||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
|
@ -27,26 +26,9 @@ type Runtime struct {
|
|||
jailerBinaryPath string
|
||||
networkAllocator *NetworkAllocator
|
||||
networkProvisioner NetworkProvisioner
|
||||
|
||||
mu sync.RWMutex
|
||||
machines map[MachineID]*managedMachine
|
||||
}
|
||||
|
||||
type managedMachine struct {
|
||||
cmd *exec.Cmd
|
||||
entered bool
|
||||
exited chan struct{}
|
||||
network NetworkAllocation
|
||||
paths machinePaths
|
||||
spec MachineSpec
|
||||
state MachineState
|
||||
stopping bool
|
||||
}
|
||||
|
||||
const (
|
||||
defaultVSockCIDStart = uint32(3)
|
||||
defaultVSockID = "vsock0"
|
||||
)
|
||||
const debugPreserveFailureEnv = "FIRECRACKER_DEBUG_PRESERVE_FAILURE"
|
||||
|
||||
func NewRuntime(cfg RuntimeConfig) (*Runtime, error) {
|
||||
rootDir := filepath.Clean(strings.TrimSpace(cfg.RootDir))
|
||||
|
|
@ -79,64 +61,28 @@ func NewRuntime(cfg RuntimeConfig) (*Runtime, error) {
|
|||
jailerBinaryPath: jailerBinaryPath,
|
||||
networkAllocator: allocator,
|
||||
networkProvisioner: NewIPTapProvisioner(),
|
||||
machines: make(map[MachineID]*managedMachine),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (r *Runtime) Boot(ctx context.Context, spec MachineSpec) (*MachineState, error) {
|
||||
func (r *Runtime) Boot(ctx context.Context, spec MachineSpec, usedNetworks []NetworkAllocation) (*MachineState, error) {
|
||||
if err := spec.Validate(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
r.mu.Lock()
|
||||
if _, exists := r.machines[spec.ID]; exists {
|
||||
r.mu.Unlock()
|
||||
return nil, fmt.Errorf("machine %q already exists", spec.ID)
|
||||
}
|
||||
|
||||
usedNetworks := make([]NetworkAllocation, 0, len(r.machines))
|
||||
usedVSockCIDs := make(map[uint32]struct{}, len(r.machines))
|
||||
for _, machine := range r.machines {
|
||||
if machine == nil {
|
||||
continue
|
||||
}
|
||||
usedNetworks = append(usedNetworks, machine.network)
|
||||
if machine.spec.Vsock != nil {
|
||||
usedVSockCIDs[machine.spec.Vsock.CID] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
spec, err := r.resolveVSock(spec, usedVSockCIDs)
|
||||
if err != nil {
|
||||
r.mu.Unlock()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
r.machines[spec.ID] = &managedMachine{
|
||||
spec: spec,
|
||||
state: MachineState{
|
||||
ID: spec.ID,
|
||||
Phase: PhaseProvisioning,
|
||||
},
|
||||
entered: true,
|
||||
}
|
||||
r.mu.Unlock()
|
||||
|
||||
cleanup := func(network NetworkAllocation, paths machinePaths, command *exec.Cmd) {
|
||||
if preserveFailureArtifacts() {
|
||||
fmt.Fprintf(os.Stderr, "firecracker debug: preserving failure artifacts machine=%s pid=%d socket=%s base=%s\n", spec.ID, pidOf(command), paths.SocketPath, paths.BaseDir)
|
||||
return
|
||||
}
|
||||
cleanupStartedProcess(command)
|
||||
_ = r.networkProvisioner.Remove(context.Background(), network)
|
||||
_ = removeIfExists(hostVSockPath(paths, spec))
|
||||
if paths.BaseDir != "" {
|
||||
_ = os.RemoveAll(paths.BaseDir)
|
||||
}
|
||||
r.mu.Lock()
|
||||
delete(r.machines, spec.ID)
|
||||
r.mu.Unlock()
|
||||
}
|
||||
|
||||
network, err := r.networkAllocator.Allocate(usedNetworks)
|
||||
if err != nil {
|
||||
cleanup(NetworkAllocation{}, machinePaths{}, nil)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
|
|
@ -159,9 +105,14 @@ func (r *Runtime) Boot(ctx context.Context, spec MachineSpec) (*MachineState, er
|
|||
cleanup(network, paths, nil)
|
||||
return nil, err
|
||||
}
|
||||
socketPath := paths.SocketPath
|
||||
if pid := pidOf(command); pid > 0 {
|
||||
socketPath = procSocketPath(pid)
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "firecracker debug: launched machine=%s pid=%d socket=%s jailer_base=%s\n", spec.ID, pidOf(command), socketPath, paths.JailerBaseDir)
|
||||
|
||||
client := newAPIClient(paths.SocketPath)
|
||||
if err := waitForSocket(ctx, client, paths.SocketPath); err != nil {
|
||||
client := newAPIClient(socketPath)
|
||||
if err := waitForSocket(ctx, client, socketPath); err != nil {
|
||||
cleanup(network, paths, command)
|
||||
return nil, fmt.Errorf("wait for firecracker socket: %w", err)
|
||||
}
|
||||
|
|
@ -187,162 +138,74 @@ func (r *Runtime) Boot(ctx context.Context, spec MachineSpec) (*MachineState, er
|
|||
Phase: PhaseRunning,
|
||||
PID: pid,
|
||||
RuntimeHost: network.GuestIP().String(),
|
||||
SocketPath: paths.SocketPath,
|
||||
SocketPath: socketPath,
|
||||
TapName: network.TapName,
|
||||
StartedAt: &now,
|
||||
}
|
||||
|
||||
r.mu.Lock()
|
||||
entry := r.machines[spec.ID]
|
||||
entry.cmd = command
|
||||
entry.exited = make(chan struct{})
|
||||
entry.network = network
|
||||
entry.paths = paths
|
||||
entry.state = state
|
||||
r.mu.Unlock()
|
||||
|
||||
go r.watchMachine(spec.ID, command, entry.exited)
|
||||
|
||||
out := state
|
||||
return &out, nil
|
||||
}
|
||||
|
||||
func (r *Runtime) Inspect(id MachineID) (*MachineState, error) {
|
||||
r.mu.RLock()
|
||||
entry, ok := r.machines[id]
|
||||
r.mu.RUnlock()
|
||||
if !ok || entry == nil {
|
||||
return nil, ErrMachineNotFound
|
||||
}
|
||||
|
||||
state := entry.state
|
||||
if state.PID > 0 && !processExists(state.PID) {
|
||||
state.Phase = PhaseStopped
|
||||
state.PID = 0
|
||||
}
|
||||
return &state, nil
|
||||
}
|
||||
|
||||
func (r *Runtime) Stop(ctx context.Context, id MachineID) error {
|
||||
r.mu.RLock()
|
||||
entry, ok := r.machines[id]
|
||||
r.mu.RUnlock()
|
||||
if !ok || entry == nil {
|
||||
return ErrMachineNotFound
|
||||
func (r *Runtime) Inspect(state MachineState) (*MachineState, error) {
|
||||
if state.PID > 0 && !processExists(state.PID) {
|
||||
state.Phase = PhaseFailed
|
||||
state.PID = 0
|
||||
state.Error = "firecracker process not found"
|
||||
}
|
||||
if entry.cmd == nil || entry.cmd.Process == nil {
|
||||
return fmt.Errorf("machine %q has no firecracker process", id)
|
||||
}
|
||||
if entry.state.Phase == PhaseStopped {
|
||||
return &state, nil
|
||||
}
|
||||
|
||||
func (r *Runtime) Stop(ctx context.Context, state MachineState) error {
|
||||
if state.PID < 1 || !processExists(state.PID) {
|
||||
return nil
|
||||
}
|
||||
|
||||
r.mu.Lock()
|
||||
entry.stopping = true
|
||||
process := entry.cmd.Process
|
||||
exited := entry.exited
|
||||
r.mu.Unlock()
|
||||
|
||||
process, err := os.FindProcess(state.PID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("find process for machine %q: %w", state.ID, err)
|
||||
}
|
||||
if err := process.Signal(syscall.SIGTERM); err != nil && !errors.Is(err, os.ErrProcessDone) {
|
||||
return fmt.Errorf("stop machine %q: %w", id, err)
|
||||
return fmt.Errorf("stop machine %q: %w", state.ID, err)
|
||||
}
|
||||
|
||||
select {
|
||||
case <-exited:
|
||||
return nil
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
ticker := time.NewTicker(50 * time.Millisecond)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
if !processExists(state.PID) {
|
||||
return nil
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-ticker.C:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Runtime) Delete(ctx context.Context, id MachineID) error {
|
||||
r.mu.RLock()
|
||||
entry, ok := r.machines[id]
|
||||
r.mu.RUnlock()
|
||||
if !ok || entry == nil {
|
||||
return ErrMachineNotFound
|
||||
func (r *Runtime) Delete(ctx context.Context, state MachineState) error {
|
||||
if err := r.Stop(ctx, state); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if entry.state.Phase == PhaseRunning {
|
||||
if err := r.Stop(ctx, id); err != nil && !errors.Is(err, context.Canceled) {
|
||||
if strings.TrimSpace(state.RuntimeHost) != "" && strings.TrimSpace(state.TapName) != "" {
|
||||
network, err := AllocationFromGuestIP(state.RuntimeHost, state.TapName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := r.networkProvisioner.Remove(ctx, network); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if err := r.networkProvisioner.Remove(ctx, entry.network); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := removeIfExists(hostVSockPath(entry.paths, entry.spec)); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.RemoveAll(entry.paths.BaseDir); err != nil {
|
||||
return fmt.Errorf("remove machine dir %q: %w", entry.paths.BaseDir, err)
|
||||
}
|
||||
|
||||
r.mu.Lock()
|
||||
delete(r.machines, id)
|
||||
r.mu.Unlock()
|
||||
paths, err := buildMachinePaths(r.rootDir, state.ID, r.firecrackerBinaryPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.RemoveAll(paths.BaseDir); err != nil {
|
||||
return fmt.Errorf("remove machine dir %q: %w", paths.BaseDir, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *Runtime) watchMachine(id MachineID, command *exec.Cmd, exited chan struct{}) {
|
||||
err := command.Wait()
|
||||
close(exited)
|
||||
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
|
||||
entry, ok := r.machines[id]
|
||||
if !ok || entry == nil || entry.cmd != command {
|
||||
return
|
||||
}
|
||||
|
||||
entry.state.PID = 0
|
||||
if entry.stopping {
|
||||
entry.state.Phase = PhaseStopped
|
||||
entry.state.Error = ""
|
||||
entry.stopping = false
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
entry.state.Phase = PhaseError
|
||||
entry.state.Error = err.Error()
|
||||
return
|
||||
}
|
||||
|
||||
entry.state.Phase = PhaseStopped
|
||||
entry.state.Error = ""
|
||||
}
|
||||
|
||||
func (r *Runtime) resolveVSock(spec MachineSpec, used map[uint32]struct{}) (MachineSpec, error) {
|
||||
if spec.Vsock != nil {
|
||||
if _, exists := used[spec.Vsock.CID]; exists {
|
||||
return MachineSpec{}, fmt.Errorf("vsock cid %d already in use", spec.Vsock.CID)
|
||||
}
|
||||
return spec, nil
|
||||
}
|
||||
|
||||
cid, err := nextVSockCID(used)
|
||||
if err != nil {
|
||||
return MachineSpec{}, err
|
||||
}
|
||||
|
||||
spec.Vsock = &VsockSpec{
|
||||
ID: defaultVSockID,
|
||||
CID: cid,
|
||||
Path: string(spec.ID) + ".sock",
|
||||
}
|
||||
return spec, nil
|
||||
}
|
||||
|
||||
func nextVSockCID(used map[uint32]struct{}) (uint32, error) {
|
||||
for cid := defaultVSockCIDStart; cid != 0; cid++ {
|
||||
if _, exists := used[cid]; !exists {
|
||||
return cid, nil
|
||||
}
|
||||
}
|
||||
return 0, fmt.Errorf("vsock cid space exhausted")
|
||||
}
|
||||
|
||||
func processExists(pid int) bool {
|
||||
if pid < 1 {
|
||||
return false
|
||||
|
|
@ -351,12 +214,19 @@ func processExists(pid int) bool {
|
|||
return err == nil || err == syscall.EPERM
|
||||
}
|
||||
|
||||
func removeIfExists(path string) error {
|
||||
if path == "" {
|
||||
return nil
|
||||
func pidOf(command *exec.Cmd) int {
|
||||
if command == nil || command.Process == nil {
|
||||
return 0
|
||||
}
|
||||
return command.Process.Pid
|
||||
}
|
||||
|
||||
func preserveFailureArtifacts() bool {
|
||||
value := strings.TrimSpace(os.Getenv(debugPreserveFailureEnv))
|
||||
switch strings.ToLower(value) {
|
||||
case "1", "true", "yes", "on":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
if err := os.Remove(path); err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return fmt.Errorf("remove path %q: %w", path, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue