chore: termination cleanup tighten + test

This commit is contained in:
Harivansh Rathi 2026-04-08 18:54:37 +00:00
parent 592df1e1df
commit 28ca0219d9
2 changed files with 176 additions and 1 deletions

View file

@ -14,6 +14,11 @@ import (
var ErrMachineNotFound = errors.New("machine not found")
var (
stopGracePeriod = 5 * time.Second
stopPollInterval = 50 * time.Millisecond
)
type RuntimeConfig struct {
RootDir string
FirecrackerBinaryPath string
@ -163,13 +168,21 @@ func (r *Runtime) Stop(ctx context.Context, state MachineState) error {
return fmt.Errorf("stop machine %q: %w", state.ID, err)
}
ticker := time.NewTicker(50 * time.Millisecond)
ticker := time.NewTicker(stopPollInterval)
defer ticker.Stop()
deadline := time.Now().Add(stopGracePeriod)
sentKill := false
for {
if !processExists(state.PID) {
return nil
}
if !sentKill && time.Now().After(deadline) {
if err := process.Signal(syscall.SIGKILL); err != nil && !errors.Is(err, os.ErrProcessDone) {
return fmt.Errorf("kill machine %q: %w", state.ID, err)
}
sentKill = true
}
select {
case <-ctx.Done():
return ctx.Err()
@ -206,6 +219,13 @@ func processExists(pid int) bool {
if pid < 1 {
return false
}
if payload, err := os.ReadFile(filepath.Join("/proc", fmt.Sprintf("%d", pid), "stat")); err == nil {
if marker := strings.LastIndexByte(string(payload), ')'); marker >= 0 && marker+2 < len(payload) {
if payload[marker+2] == 'Z' {
return false
}
}
}
err := syscall.Kill(pid, 0)
return err == nil || err == syscall.EPERM
}

View file

@ -0,0 +1,155 @@
package firecracker
import (
"context"
"os"
"os/exec"
"os/signal"
"path/filepath"
"syscall"
"testing"
"time"
)
const testHelperProcessEnv = "MICROAGENT_TEST_HELPER_PROCESS"
const testHelperReadyFileEnv = "MICROAGENT_TEST_HELPER_READY_FILE"
func TestStopEscalatesToSIGKILL(t *testing.T) {
if os.Getenv(testHelperProcessEnv) == "ignore-term" {
runIgnoreTERMHelper()
return
}
readyFile := filepath.Join(t.TempDir(), "ready")
command := exec.Command(os.Args[0], "-test.run=TestStopEscalatesToSIGKILL")
command.Env = append(
os.Environ(),
testHelperProcessEnv+"=ignore-term",
testHelperReadyFileEnv+"="+readyFile,
)
if err := command.Start(); err != nil {
t.Fatalf("start helper process: %v", err)
}
t.Cleanup(func() {
_ = command.Process.Kill()
_ = command.Wait()
})
waitForHelperReady(t, readyFile)
restore := setStopTimings(20*time.Millisecond, 5*time.Millisecond)
defer restore()
runtime := &Runtime{}
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
if err := runtime.Stop(ctx, MachineState{ID: "vm-1", PID: command.Process.Pid}); err != nil {
t.Fatalf("stop machine: %v", err)
}
if processExists(command.Process.Pid) {
t.Fatalf("process %d still running after stop", command.Process.Pid)
}
}
func TestDeleteRemovesRuntimeDirAfterForcedKill(t *testing.T) {
if os.Getenv(testHelperProcessEnv) == "ignore-term" {
runIgnoreTERMHelper()
return
}
root := t.TempDir()
paths, err := buildMachinePaths(root, "vm-1", "/usr/bin/firecracker")
if err != nil {
t.Fatalf("build machine paths: %v", err)
}
if err := os.MkdirAll(paths.BaseDir, 0o755); err != nil {
t.Fatalf("create machine base dir: %v", err)
}
sentinelPath := filepath.Join(paths.BaseDir, "sentinel")
if err := os.WriteFile(sentinelPath, []byte("ok"), 0o644); err != nil {
t.Fatalf("write sentinel file: %v", err)
}
readyFile := filepath.Join(t.TempDir(), "ready")
command := exec.Command(os.Args[0], "-test.run=TestDeleteRemovesRuntimeDirAfterForcedKill")
command.Env = append(
os.Environ(),
testHelperProcessEnv+"=ignore-term",
testHelperReadyFileEnv+"="+readyFile,
)
if err := command.Start(); err != nil {
t.Fatalf("start helper process: %v", err)
}
t.Cleanup(func() {
_ = command.Process.Kill()
_ = command.Wait()
})
waitForHelperReady(t, readyFile)
restore := setStopTimings(20*time.Millisecond, 5*time.Millisecond)
defer restore()
runtime := &Runtime{
rootDir: root,
firecrackerBinaryPath: "/usr/bin/firecracker",
networkProvisioner: &IPTapProvisioner{
runCommand: func(context.Context, string, ...string) error { return nil },
},
}
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
state := MachineState{
ID: "vm-1",
PID: command.Process.Pid,
}
if err := runtime.Delete(ctx, state); err != nil {
t.Fatalf("delete machine: %v", err)
}
if _, err := os.Stat(paths.BaseDir); !os.IsNotExist(err) {
t.Fatalf("machine dir still exists after delete: %v", err)
}
}
func runIgnoreTERMHelper() {
signals := make(chan os.Signal, 1)
signal.Notify(signals, syscall.SIGTERM)
defer signal.Stop(signals)
if readyFile := os.Getenv(testHelperReadyFileEnv); readyFile != "" {
_ = os.WriteFile(readyFile, []byte("ready"), 0o644)
}
for {
<-signals
}
}
func setStopTimings(grace time.Duration, poll time.Duration) func() {
previousGrace := stopGracePeriod
previousPoll := stopPollInterval
stopGracePeriod = grace
stopPollInterval = poll
return func() {
stopGracePeriod = previousGrace
stopPollInterval = previousPoll
}
}
func waitForHelperReady(t *testing.T, readyFile string) {
t.Helper()
timeout := time.NewTimer(time.Second)
defer timeout.Stop()
ticker := time.NewTicker(5 * time.Millisecond)
defer ticker.Stop()
for {
if _, err := os.Stat(readyFile); err == nil {
return
}
select {
case <-timeout.C:
t.Fatalf("timed out waiting for helper ready file %q", readyFile)
case <-ticker.C:
}
}
}