Fixed most bugs with spawning services (last: block signals)

This commit is contained in:
nocturn9x 2021-12-04 17:31:18 +01:00
parent 8517cdb736
commit 41a54288e3
6 changed files with 133 additions and 86 deletions

View File

@ -240,7 +240,6 @@ proc mountRealDisks*(logger: Logger, fstab: string = "/etc/fstab") =
if checkDiskIsMounted(entry, expand=true): if checkDiskIsMounted(entry, expand=true):
logger.debug(&"Skipping mounting filesystem {entry.source} ({entry.fstype}) at {entry.target}: already mounted") logger.debug(&"Skipping mounting filesystem {entry.source} ({entry.fstype}) at {entry.target}: already mounted")
continue continue
logger.debug(&"fsck returned status code {retcode}")
logger.debug(&"Mounting filesystem {entry.source} ({entry.fstype}) at {entry.target} with mount option(s) {entry.data}") logger.debug(&"Mounting filesystem {entry.source} ({entry.fstype}) at {entry.target} with mount option(s) {entry.data}")
logger.trace(&"Calling mount('{entry.source}', '{entry.target}', '{entry.fstype}', {entry.mountflags}, '{entry.data}')") logger.trace(&"Calling mount('{entry.source}', '{entry.target}', '{entry.fstype}', {entry.mountflags}, '{entry.data}')")
retcode = mount(entry.source, entry.target, entry.fstype, entry.mountflags, entry.data) retcode = mount(entry.source, entry.target, entry.fstype, entry.mountflags, entry.data)

View File

@ -11,10 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import segfaults # Makes us catch segfaults as NilAccessDefect exceptions!
import strformat import strformat
import ../util/[logging, misc] import ../util/[logging, misc]
import services import services
@ -27,8 +27,8 @@ proc mainLoop*(logger: Logger) =
logger.info("System initialization complete, going idle") logger.info("System initialization complete, going idle")
while true: while true:
try: try:
# TODO logger.info(&"Working...")
sleepSeconds(5) sleepSeconds(30)
except: except:
logger.critical(&"A critical error has occurred while running, restarting the mainloop! Error -> {getCurrentExceptionMsg()}") logger.critical(&"A critical error has occurred while running, restarting the mainloop! Error -> {getCurrentExceptionMsg()}")
# We *absolutely* cannot die # We *absolutely* cannot die

View File

@ -100,93 +100,102 @@ proc removeService*(service: Service) =
proc supervisorWorker(logger: Logger, service: Service, pid: int) = proc supervisorWorker(logger: Logger, service: Service, pid: int) =
## This is the actual worker that supervises the service process ## This is the actual worker that supervises the service process
logger.trace(&"New supervisor for service '{service.name}' has been spawned")
var pid = pid var pid = pid
var status: cint var status: cint
var returnCode: int var returnCode: int
var sig: int var sig: int
var process: Process
while true: while true:
returnCode = posix.waitPid(Pid(pid), status, WUNTRACED) returnCode = posix.waitPid(cint(pid), status, WUNTRACED)
if WIFEXITED(status): if WIFEXITED(status):
sig = 0 sig = 0
elif WIFSIGNALED(status): elif WIFSIGNALED(status):
sig = WTERMSIG(status) sig = WTERMSIG(status)
else: else:
sig = -1 sig = -1
if service.restartOnFailure and sig > 0: if sig > 0 and service.restartOnFailure:
logger.info(&"Service {service.name} has exited with return code {returnCode} (terminated by signal {sig}: {strsignal(cint(sig))}), sleeping {service.restartDelay} seconds before restarting it") logger.info(&"Service '{service.name}' ({returnCode}) has crashed (terminated by signal {sig}: {strsignal(cint(sig))}), sleeping {service.restartDelay} seconds before restarting it")
removeManagedProcess(pid) removeManagedProcess(pid)
sleepSeconds(service.restartDelay) sleepSeconds(service.restartDelay)
var split = shlex(service.exec) var split = shlex(service.exec)
if split.error: if split.error:
logger.error(&"Error while starting service {service.name}: invalid exec syntax") logger.error(&"Error while restarting service '{service.name}': invalid exec syntax")
return break
var arguments = split.words var arguments = split.words
let progName = arguments[0] let progName = arguments[0]
arguments = arguments[1..^1] arguments = arguments[1..^1]
pid = startProcess(progName, workingDir=service.workDir, args=arguments, options={poParentStreams}).processID() process = startProcess(progName, workingDir=service.workDir, args=arguments)
else: pid = process.processID()
logger.info(&"Service {service.name} has exited with return code {returnCode}), shutting down controlling process") elif sig > 0:
logger.info(&"Service '{service.name}' ({returnCode}) has crashed (terminated by signal {sig}: {strsignal(cint(sig))}), shutting down controlling process")
break break
else:
logger.info(&"Service '{service.name}' ({returnCode}) has exited, shutting down controlling process")
break
if process != nil:
process.close()
proc startService(logger: Logger, service: Service) = proc startService(logger: Logger, service: Service) =
## Starts a single service (this is called by ## Starts a single service (this is called by
## startServices below until all services have ## startServices below until all services have
## been started) ## been started)
var process: Process
var split = shlex(service.exec)
if split.error:
logger.error(&"Error while starting service {service.name}: invalid exec syntax")
return
var arguments = split.words
let progName = arguments[0]
arguments = arguments[1..^1]
try: try:
var process = startProcess(progName, workingDir=service.workDir, args=arguments, options={poParentStreams, }) var split = shlex(service.exec)
if split.error:
logger.error(&"Error while starting service '{service.name}': invalid exec syntax")
quit(0)
var arguments = split.words
let progName = arguments[0]
arguments = arguments[1..^1]
process = startProcess(progName, workingDir=service.workDir, args=arguments)
if service.supervised: if service.supervised:
supervisorWorker(logger, service, process.processID) supervisorWorker(logger, service, process.processID)
except OSError: # If the service is unsupervised we just exit
except:
logger.error(&"Error while starting service {service.name}: {getCurrentExceptionMsg()}") logger.error(&"Error while starting service {service.name}: {getCurrentExceptionMsg()}")
if process != nil:
process.close()
quit(0) quit(0)
proc startServices*(logger: Logger, workers: int = 1, level: RunLevel) = proc startServices*(logger: Logger, level: RunLevel, workers: int = 1) =
## Starts the services in the given ## Starts the registered services in the
## runlevel. The workers parameter ## given runlevel
## configures parallelism and allows if workers > cpuinfo.countProcessors() - 1:
## for faster boot times by starting logger.warning(&"The configured number of workers ({workers}) is greater than the recommended one ({cpuinfo.countProcessors() - 1}), performance may degrade")
## services concurrently rather than var workerCount: int = 0
## sequentially (1 to disable parallelism). var status: cint
## Note this function immediately returns to
## the caller and forks in the background
echo posix.getpid()
discard readLine(stdin)
var pid: int = posix.fork() var pid: int = posix.fork()
if pid == -1: if pid == -1:
logger.fatal(&"Could not fork(): {posix.strerror(posix.errno)}") logger.error(&"Error, cannot fork: {posix.strerror(posix.errno)}")
return
elif pid == 0: elif pid == 0:
quit(0) logger.debug("Started service spawner process")
var servicesCopy: seq[Service] = @[] var servicesCopy: seq[Service] = @[]
echo servicesCopy.len(), " ", posix.getpid() for service in services:
for service in services: if service.runlevel == level:
if service.runlevel == level: servicesCopy.add(service)
servicesCopy.add(service) while servicesCopy.len() > 0:
echo servicesCopy.len(), " ", posix.getpid() if workerCount == workers:
if workers > cpuinfo.countProcessors() * 2 - 1: discard waitPid(cint(pid), status, WUNTRACED)
logger.warning(&"The configured workers count is beyond the recommended threshold ({workers} > {cpuinfo.countProcessors() * 2 - 1}), performance may degrade") dec(workerCount)
while servicesCopy.len() > 0:
echo servicesCopy.len(), " ", posix.getpid()
sleepSeconds(5)
for i in 1..workers:
pid = posix.fork() pid = posix.fork()
if pid == -1: if pid == -1:
logger.error(&"An error occurred while forking to spawn services, trying again: {posix.strerror(posix.errno)}") logger.error(&"An error occurred while forking to spawn services, trying again: {posix.strerror(posix.errno)}")
elif pid == 0: elif pid == 0:
logger.info(&"Starting service {servicesCopy[0].name}") logger.trace(&"New child has been spawned")
if not servicesCopy[0].supervised:
logger.info(&"Starting unsupervised service '{servicesCopy[0].name}'")
else:
logger.info(&"Starting supervised service '{servicesCopy[0].name}'")
startService(logger, servicesCopy[0]) startService(logger, servicesCopy[0])
else: elif servicesCopy.len() > 0:
if servicesCopy.len() > 0 and servicesCopy[0].supervised: workerCount += 1
if servicesCopy[0].supervised:
addManagedProcess(pid, servicesCopy[0]) addManagedProcess(pid, servicesCopy[0])
servicesCopy.del(0)
quit(0)
else:
discard waitPid(cint(pid), status, WUNTRACED)

View File

@ -79,6 +79,15 @@ proc nimDExit*(logger: Logger, code: int, emerg: bool = true) =
## NimD's exit point. This function tries to shut down ## NimD's exit point. This function tries to shut down
## as cleanly as possible. When emerg equals true, it will ## as cleanly as possible. When emerg equals true, it will
## try to spawn a root shell and exit ## try to spawn a root shell and exit
if emerg:
var status: cint
# We're in emergency mode: do not crash the kernel, spawn a shell and exit
logger.fatal("NimD has entered emergency mode and cannot continue. You will be now (hopefully) dropped in a root shell: you're on your own. May the force be with you")
logger.info("Terminating child processes with SIGKILL")
discard posix.kill(SIGKILL, -1)
discard posix.waitPid(-1, status, 0)
discard execShellCmd("/bin/sh") # TODO: Is this fine? maybe use execProcess
quit(-1)
logger.warning("The system is shutting down") logger.warning("The system is shutting down")
logger.info("Processing shutdown runlevel") logger.info("Processing shutdown runlevel")
# TODO # TODO
@ -89,22 +98,15 @@ proc nimDExit*(logger: Logger, code: int, emerg: bool = true) =
except: except:
logger.error(&"An error has occurred while calling shutdown handlers. Error -> {getCurrentExceptionMsg()}") logger.error(&"An error has occurred while calling shutdown handlers. Error -> {getCurrentExceptionMsg()}")
# Note: continues calling handlers! # Note: continues calling handlers!
if emerg: logger.info("Terminating child processes with SIGTERM")
# We're in emergency mode: do not crash the kernel, spawn a shell and exit logger.debug(&"Waiting up to {sigTermDelay} seconds for the kernel to deliver signals")
logger.fatal("NimD has entered emergency mode and cannot continue. You will be now (hopefully) dropped in a root shell: you're on your own. May the force be with you") discard posix.kill(SIGTERM, -1) # The kernel handles this for us asynchronously
discard execShellCmd("/bin/sh") # TODO: Is this fine? maybe use execProcess var t = cpuTime()
else: # We wait some time for the signals to propagate
logger.info("Terminating child processes with SIGTERM") while anyUserlandProcessLeft() or cpuTime() - t >= sigTermDelay:
logger.debug(&"Waiting {sigTermDelay} seconds for the kernel to deliver signals") sleepSeconds(0.25)
discard posix.kill(SIGTERM, -1) # The kernel handles this for us asynchronously if anyUserlandProcessLeft():
var t = cpuTime() logger.info("Terminating child processes with SIGKILL")
# We wait some time for the signals to propagate discard posix.kill(SIGKILL, -1)
while anyUserlandProcessLeft(): logger.warning("Shutdown procedure complete, sending final termination signal")
sleepSeconds(0.25) quit(code)
if cpuTime() - t >= sigTermDelay:
break
if anyUserlandProcessLeft():
logger.info("Terminating child processes with SIGKILL")
discard posix.kill(SIGKILL, -1)
logger.warning("Shutdown procedure complete, sending final termination signal")
quit(code)

View File

@ -39,8 +39,8 @@ proc main(logger: Logger, mountDisks: bool = true, fstab: string = "/etc/fstab")
logger.fatal(&"NimD must run as root, but current user id is {uid}") logger.fatal(&"NimD must run as root, but current user id is {uid}")
nimDExit(logger, EPERM) # EPERM - Operation not permitted nimDExit(logger, EPERM) # EPERM - Operation not permitted
logger.trace("Setting up signal handlers") logger.trace("Setting up signal handlers")
onSignal(SIGABRT, SIGALRM, SIGHUP, SIGILL, SIGKILL, SIGQUIT, SIGSTOP, SIGTSTP, onSignal(SIGABRT, SIGALRM, SIGHUP, SIGILL, SIGKILL, SIGQUIT, SIGSTOP, SIGSEGV, SIGTSTP,
SIGTRAP, SIGTERM, SIGPIPE, SIGUSR1, SIGUSR2, 6, SIGFPE, SIGBUS, SIGURG, SIGINT): # 6 is SIGIOT SIGTRAP, SIGPIPE, SIGUSR1, SIGUSR2, 6, SIGFPE, SIGBUS, SIGURG, SIGTERM, SIGINT): # 6 is SIGIOT
# Can't capture local variables because this implicitly generates # Can't capture local variables because this implicitly generates
# a noconv procedure, so we use getDefaultLogger() instead. Must find # a noconv procedure, so we use getDefaultLogger() instead. Must find
# a better solution long-term because we need the configuration from # a better solution long-term because we need the configuration from
@ -51,6 +51,9 @@ proc main(logger: Logger, mountDisks: bool = true, fstab: string = "/etc/fstab")
# One of the key features of an init system is reaping child # One of the key features of an init system is reaping child
# processes! # processes!
reapProcess(getDefaultLogger()) reapProcess(getDefaultLogger())
onSignal(SIGINT):
# Temporary
nimDExit(getDefaultLogger(), 131, emerg=true)
addSymlink(newSymlink(dest="/dev/fd", source="/proc/self/fd")) addSymlink(newSymlink(dest="/dev/fd", source="/proc/self/fd"))
addSymlink(newSymlink(dest="/dev/fd/0", source="/proc/self/fd/0")) addSymlink(newSymlink(dest="/dev/fd/0", source="/proc/self/fd/0"))
addSymlink(newSymlink(dest="/dev/fd/1", source="/proc/self/fd/1")) addSymlink(newSymlink(dest="/dev/fd/1", source="/proc/self/fd/1"))
@ -90,12 +93,38 @@ proc main(logger: Logger, mountDisks: bool = true, fstab: string = "/etc/fstab")
logger.debug(&"Hostname was set to '{setHostname(logger)}'") logger.debug(&"Hostname was set to '{setHostname(logger)}'")
logger.info("Creating symlinks") logger.info("Creating symlinks")
createSymlinks(logger) createSymlinks(logger)
logger.info("Creating directories")
createDirectories(logger) createDirectories(logger)
logger.debug("Entering critical fork() section: blocking signals")
var sigset: Sigset
# TODO
logger.info("Processing boot runlevel") logger.info("Processing boot runlevel")
addService(newService(name="Test", description="prints owo", exec="/bin/echo owo", addService(newService(name="echoer", description="prints owo", exec="/bin/echo owo",
runlevel=Boot, kind=Oneshot, workDir=getCurrentDir(), runlevel=Boot, kind=Oneshot, workDir=getCurrentDir(),
supervised=false, restartOnFailure=false, restartDelay=0)) supervised=false, restartOnFailure=false, restartDelay=0))
startServices(logger, workers=1, level=Boot) addService(newService(name="sleeper", description="la mamma di licenziato",
exec="/usr/bin/sleep 10", supervised=true, restartOnFailure=true,
restartDelay=5, runlevel=Boot, workDir="/home", kind=Simple))
addService(newService(name="errorer", description="la mamma di gavd",
exec="/bin/false", supervised=true, restartOnFailure=true,
restartDelay=5, runlevel=Boot, workDir="/", kind=Simple))
startServices(logger, workers=2, level=Boot)
logger.debug("Setting up real signal handlers")
onSignal(SIGABRT, SIGALRM, SIGHUP, SIGILL, SIGKILL, SIGQUIT, SIGSTOP, SIGSEGV, SIGTSTP,
SIGTRAP, SIGPIPE, SIGUSR1, SIGUSR2, 6, SIGFPE, SIGBUS, SIGURG, SIGTERM): # 6 is SIGIOT
# Can't capture local variables because this implicitly generates
# a noconv procedure, so we use getDefaultLogger() instead. Must find
# a better solution long-term because we need the configuration from
# our own logger object (otherwise we'd always create a new one and
# never switch our logs to file once booting is completed)
getDefaultLogger().warning(&"Ignoring signal {sig} ({strsignal(sig)})") # Nim injects the variable "sig" into the scope. Gotta love those macros
onSignal(SIGCHLD):
# One of the key features of an init system is reaping child
# processes!
reapProcess(getDefaultLogger())
onSignal(SIGINT):
# Temporary
nimDExit(getDefaultLogger(), 131, emerg=true)
logger.debug("Starting main loop") logger.debug("Starting main loop")
mainLoop(logger) mainLoop(logger)
@ -144,7 +173,6 @@ when isMainModule:
try: try:
main(logger) main(logger)
except: except:
raise
logger.fatal(&"A fatal unrecoverable error has occurred during startup and NimD cannot continue: {getCurrentExceptionMsg()}") logger.fatal(&"A fatal unrecoverable error has occurred during startup and NimD cannot continue: {getCurrentExceptionMsg()}")
nimDExit(logger, 131) # ENOTRECOVERABLE - State not recoverable nimDExit(logger, 131) # ENOTRECOVERABLE - State not recoverable
# This will almost certainly cause the kernel to crash with an error the likes of "Kernel not syncing, attempted to kill init!", # This will almost certainly cause the kernel to crash with an error the likes of "Kernel not syncing, attempted to kill init!",

View File

@ -15,6 +15,7 @@
# A simple logging module inspired by python's own logging facility # A simple logging module inspired by python's own logging facility
import terminal import terminal
import strformat import strformat
import posix
import times import times
@ -73,47 +74,55 @@ proc getDefaultLogger*(): Logger =
## level that writes the given message to the ## level that writes the given message to the
## standard error with some basic info like the ## standard error with some basic info like the
## current date and time and the log level ## current date and time and the log level
setStdIoUnbuffered() # Just in case setStdIoUnbuffered() # Just in case
proc logTrace(self: LogHandler, logger: Logger, message: string) = proc logTrace(self: LogHandler, logger: Logger, message: string) =
setForegroundColor(fgMagenta) setForegroundColor(fgMagenta)
stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - TRACE] {message}""") stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - TRACE ({posix.getpid()})] {message}""")
stderr.flushFile()
setForegroundColor(fgDefault) setForegroundColor(fgDefault)
proc logDebug(self: LogHandler, logger: Logger, message: string) = proc logDebug(self: LogHandler, logger: Logger, message: string) =
setForegroundColor(fgCyan) setForegroundColor(fgCyan)
stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - DEBUG] {message}""") stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - DEBUG ({posix.getpid()})] {message}""")
stderr.flushFile()
setForegroundColor(fgDefault) setForegroundColor(fgDefault)
proc logInfo(self: LogHandler, logger: Logger, message: string) = proc logInfo(self: LogHandler, logger: Logger, message: string) =
setForegroundColor(fgGreen) setForegroundColor(fgGreen)
stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - INFO] {message}""") stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - INFO ({posix.getpid()})] {message}""")
stderr.flushFile()
setForegroundColor(fgDefault) setForegroundColor(fgDefault)
proc logWarning(self: LogHandler, logger: Logger, message: string) = proc logWarning(self: LogHandler, logger: Logger, message: string) =
setForegroundColor(fgYellow) setForegroundColor(fgYellow)
stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - WARNING] {message}""") stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - WARNING ({posix.getpid()})] {message}""")
stderr.flushFile()
setForegroundColor(fgDefault) setForegroundColor(fgDefault)
proc logError(self: LogHandler, logger: Logger, message: string) = proc logError(self: LogHandler, logger: Logger, message: string) =
setForegroundColor(fgRed) setForegroundColor(fgRed)
stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - ERROR] {message}""") stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - ERROR ({posix.getpid()})] {message}""")
stderr.flushFile()
setForegroundColor(fgDefault) setForegroundColor(fgDefault)
proc logCritical(self: LogHandler, logger: Logger, message: string) = proc logCritical(self: LogHandler, logger: Logger, message: string) =
setForegroundColor(fgRed) setForegroundColor(fgRed)
stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - CRITICAL] {message}""") stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - CRITICAL ({posix.getpid()})] {message}""")
stderr.flushFile()
setForegroundColor(fgDefault) setForegroundColor(fgDefault)
proc logFatal(self: LogHandler, logger: Logger, message: string) = proc logFatal(self: LogHandler, logger: Logger, message: string) =
setForegroundColor(fgBlack) setForegroundColor(fgBlack)
setBackgroundColor(bgRed) setBackgroundColor(bgRed)
stderr.write(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - FATAL]""") stderr.write(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - FATAL ({posix.getpid()})]""")
setForegroundColor(fgRed) setForegroundColor(fgRed)
setBackgroundColor(bgDefault) setBackgroundColor(bgDefault)
stderr.writeline(&" {message}") stderr.writeline(&" {message}")
setForegroundColor(fgDefault) setForegroundColor(fgDefault)
stderr.flushFile()
result = newLogger() result = newLogger()
result.addHandler(createHandler(logTrace, LogLevel.Trace)) result.addHandler(createHandler(logTrace, LogLevel.Trace))