Files
kvm/failsafe.go
Marc Brooks d1027206bc Enhance synctrace logging (#980)
* Enhance synctrace logging.

Switched the maps to be indexed by the .Pointer (not a string)
Grouped the lockCount, unlockCount, and lastLock in a trackingEntry so we can detect unlocks of something that was never locked as well as excessive unlocks; it also tracks the first lock time and the last unlock time.
Added LogDangledLocks for debugging use.
Added a panic handler to the Main so we can log out panics

* Switch to traceable sync for most everything

* More documentation

* Update internal/sync/log.go

* Update DEVELOPMENT.md

* Resolve merge issue.

* Applied review comments

* Restore --enable-sync-trace option.

* Use WithLevel so we can re-panic as desired
2026-01-28 09:19:56 +01:00

137 lines
3.3 KiB
Go

package kvm
import (
"io"
"os"
"strings"
"github.com/jetkvm/kvm/internal/supervisor"
"github.com/jetkvm/kvm/internal/sync"
)
// Paths and environment variable names consulted by checkFailsafeReason.
const (
// failsafeDefaultLastCrashPath is the crash log location used when the
// failsafeLastCrashEnv environment variable is unset or empty.
failsafeDefaultLastCrashPath = "/userdata/jetkvm/crashdump/last-crash.log"
// failsafeFile is a marker file: when it exists while checkFailsafeReason
// runs, failsafe mode is enabled and the file is removed.
failsafeFile = "/userdata/jetkvm/.enablefailsafe"
// failsafeLastCrashEnv names the environment variable that overrides the
// path of the last crash log.
failsafeLastCrashEnv = "JETKVM_LAST_ERROR_PATH"
// failsafeEnv names the environment variable that forces failsafe mode when
// its value is "1".
failsafeEnv = "JETKVM_FORCE_FAILSAFE"
)
// Failsafe state. The mode flag, reason and crash log are written inside the
// failsafeOnce-guarded body of checkFailsafeReason.
var (
	// failsafeOnce ensures the failsafe detection logic runs a single time.
	failsafeOnce sync.Once

	// failsafeCrashLog holds the tail of the last crash log, when one was read.
	failsafeCrashLog string

	// failsafeModeActive is true once a failsafe condition has been detected.
	failsafeModeActive bool

	// failsafeModeReason names the detected condition, e.g. "failsafe_env_set",
	// "failsafe_file_exists", "video" or "unknown".
	failsafeModeReason string
)
// FailsafeModeNotification is the payload of the "failsafeMode" JSON-RPC
// event emitted by notifyFailsafeMode.
type FailsafeModeNotification struct {
// Active reports whether failsafe mode is active.
Active bool `json:"active"`
// Reason carries the value of failsafeModeReason, e.g. "video" or "unknown".
Reason string `json:"reason"`
}
// checkFailsafeReason decides whether failsafe mode must be enabled and
// records the outcome in failsafeModeActive, failsafeModeReason and
// failsafeCrashLog.
//
// The checks run in this order and stop at the first match:
//  1. the failsafeEnv environment variable equals "1";
//  2. the failsafeFile marker exists (it is removed afterwards);
//  3. the last crash log, whose path comes from failsafeLastCrashEnv or
//     defaults to failsafeDefaultLastCrashPath, is a symlink that can be read.
//
// The function has side effects (it deletes the marker file and the crash log
// symlink), so its body is guarded by failsafeOnce and executes only once;
// subsequent calls do nothing.
func checkFailsafeReason() {
	failsafeOnce.Do(func() {
		// An explicit request through the environment wins over everything else.
		if os.Getenv(failsafeEnv) == "1" {
			failsafeModeActive = true
			failsafeModeReason = "failsafe_env_set"
			return
		}

		// The marker file is consumed so that it only affects this run.
		if _, err := os.Stat(failsafeFile); err == nil {
			failsafeModeActive = true
			failsafeModeReason = "failsafe_file_exists"
			if err := os.Remove(failsafeFile); err != nil && !os.IsNotExist(err) {
				// A marker that cannot be removed will be found again the
				// next time this check runs, so surface the failure.
				failsafeLogger.Warn().Err(err).Str("path", failsafeFile).Msg("failed to remove failsafe file")
			}
			return
		}

		// Resolve the last crash log path, preferring the environment override.
		lastCrashPath := os.Getenv(failsafeLastCrashEnv)
		if lastCrashPath == "" {
			lastCrashPath = failsafeDefaultLastCrashPath
		}

		l := failsafeLogger.With().Str("path", lastCrashPath).Logger()

		// Lstat (not Stat) so the link itself is inspected, not its target.
		fi, err := os.Lstat(lastCrashPath)
		if err != nil {
			// A missing crash log is the normal case and is not worth a warning.
			if !os.IsNotExist(err) {
				l.Warn().Err(err).Msg("failed to stat last crash log")
			}
			return
		}

		// Only a symlink is accepted as the last crash log.
		if fi.Mode()&os.ModeSymlink != os.ModeSymlink {
			l.Warn().Msg("last crash log is not a symlink, ignoring")
			return
		}

		// Read only the last 50KB to avoid memory issues with large log files.
		content, err := readFileTail(lastCrashPath, 50*1024)
		if err != nil {
			l.Warn().Err(err).Msg("failed to read last crash log")
			return
		}
		failsafeCrashLog = content

		// Unlink the crash log so that the same crash is not reported again.
		if err := os.Remove(lastCrashPath); err != nil && !os.IsNotExist(err) {
			l.Warn().Err(err).Msg("failed to remove last crash log")
		}

		// TODO: read the goroutine stack trace and check which goroutine is panicking
		failsafeModeActive = true
		failsafeModeReason = "unknown"

		// A log containing the supervisor's video restart-limit marker or a
		// runtime.cgocall frame is classified as a video failure.
		if strings.Contains(failsafeCrashLog, supervisor.FailsafeReasonVideoMaxRestartAttemptsReached) ||
			strings.Contains(failsafeCrashLog, "runtime.cgocall") {
			failsafeModeReason = "video"
		}
	})
}
// readFileTail returns at most the last maxBytes bytes of the file at path.
// This prevents memory issues when reading potentially large log files.
//
// When the file is larger than maxBytes, reading starts maxBytes before the
// size reported by Stat, so the result may begin in the middle of a line. The
// read itself is also capped at maxBytes: a file that grows after Stat, or
// one whose reported size does not reflect its content, can therefore never
// yield more than maxBytes. A maxBytes of zero or less yields an empty string.
//
// It returns an error if the file cannot be opened, inspected, positioned or
// read; the returned string is empty in that case.
func readFileTail(path string, maxBytes int64) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	fi, err := f.Stat()
	if err != nil {
		return "", err
	}

	// Skip everything but the tail when the file exceeds the budget.
	if size := fi.Size(); size > maxBytes {
		if _, err := f.Seek(size-maxBytes, io.SeekStart); err != nil {
			return "", err
		}
	}

	// LimitReader enforces the bound even if the size seen above was stale.
	data, err := io.ReadAll(io.LimitReader(f, maxBytes))
	if err != nil {
		return "", err
	}
	return string(data), nil
}
// notifyFailsafeMode sends a "failsafeMode" JSON-RPC event carrying the
// current failsafe reason to session. It does nothing when failsafe mode is
// not active or when session is nil.
func notifyFailsafeMode(session *Session) {
	if !failsafeModeActive {
		return
	}
	if session == nil {
		return
	}

	reason := failsafeModeReason
	jsonRpcLogger.Info().Str("reason", reason).Msg("sending failsafe mode notification")

	payload := FailsafeModeNotification{
		Active: true,
		Reason: reason,
	}
	writeJSONRPCEvent("failsafeMode", payload, session)
}