Fixed most bugs with spawning services (last: block signals)

2021-12-04 17:31:18 +01:00 · 2021-12-04 17:31:18 +01:00 · 41a54288e3
parent 8517cdb736
commit 41a54288e3
6 changed files with 133 additions and 86 deletions
--- a/src/core/fs.nim
+++ b/src/core/fs.nim
@ -240,7 +240,6 @@ proc mountRealDisks*(logger: Logger, fstab: string = "/etc/fstab") =
            if checkDiskIsMounted(entry, expand=true):
                logger.debug(&"Skipping mounting filesystem {entry.source} ({entry.fstype}) at {entry.target}: already mounted")
                continue
            logger.debug(&"fsck returned status code {retcode}")
            logger.debug(&"Mounting filesystem {entry.source} ({entry.fstype}) at {entry.target} with mount option(s) {entry.data}")
            logger.trace(&"Calling mount('{entry.source}', '{entry.target}', '{entry.fstype}', {entry.mountflags}, '{entry.data}')")
            retcode = mount(entry.source, entry.target, entry.fstype, entry.mountflags, entry.data)
--- a/src/core/mainloop.nim
+++ b/src/core/mainloop.nim
@ -11,10 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import segfaults   # Makes us catch segfaults as NilAccessDefect exceptions!
 import strformat
 import ../util/[logging, misc]
 import services
@ -27,8 +27,8 @@ proc mainLoop*(logger: Logger) =
    logger.info("System initialization complete, going idle")
    while true:
        try:
-            # TODO
+            logger.info(&"Working...")
-            sleepSeconds(5)
+            sleepSeconds(30)
        except:
            logger.critical(&"A critical error has occurred while running, restarting the mainloop! Error -> {getCurrentExceptionMsg()}")
            # We *absolutely* cannot die
--- a/src/core/services.nim
+++ b/src/core/services.nim
@ -100,93 +100,102 @@ proc removeService*(service: Service) =
 proc supervisorWorker(logger: Logger, service: Service, pid: int) =
    ## This is the actual worker that supervises the service process
    logger.trace(&"New supervisor for service '{service.name}' has been spawned")
    var pid = pid
    var status: cint
    var returnCode: int
    var sig: int
    var process: Process
    while true:
-        returnCode = posix.waitPid(Pid(pid), status, WUNTRACED)
+        returnCode = posix.waitPid(cint(pid), status, WUNTRACED)
        if WIFEXITED(status):
            sig = 0
        elif WIFSIGNALED(status):
            sig = WTERMSIG(status)
        else:
            sig = -1
-        if service.restartOnFailure and sig > 0:
+        if sig > 0 and service.restartOnFailure:
-            logger.info(&"Service {service.name} has exited with return code {returnCode} (terminated by signal {sig}: {strsignal(cint(sig))}), sleeping {service.restartDelay} seconds before restarting it")
+            logger.info(&"Service '{service.name}' ({returnCode}) has crashed (terminated by signal {sig}: {strsignal(cint(sig))}), sleeping {service.restartDelay} seconds before restarting it")
            removeManagedProcess(pid)
            sleepSeconds(service.restartDelay)
            var split = shlex(service.exec)
            if split.error:
-                logger.error(&"Error while starting service {service.name}: invalid exec syntax")
+                logger.error(&"Error while restarting service '{service.name}': invalid exec syntax")
-                return
+                break
            var arguments = split.words
            let progName = arguments[0]
            arguments = arguments[1..^1]
-            pid = startProcess(progName, workingDir=service.workDir, args=arguments, options={poParentStreams}).processID()
+            process = startProcess(progName, workingDir=service.workDir, args=arguments)
-        else:
+            pid = process.processID()
-            logger.info(&"Service {service.name} has exited with return code {returnCode}), shutting down controlling process")
+        elif sig > 0:
            logger.info(&"Service '{service.name}' ({returnCode}) has crashed (terminated by signal {sig}: {strsignal(cint(sig))}), shutting down controlling process")
            break
-    
+        else:
-    
+            logger.info(&"Service '{service.name}' ({returnCode}) has exited, shutting down controlling process")
            break
    if process != nil:
        process.close()
 proc startService(logger: Logger, service: Service) =
    ## Starts a single service (this is called by
    ## startServices below until all services have
    ## been started)
-    
+    var process: Process
    var split = shlex(service.exec)
    if split.error:
        logger.error(&"Error while starting service {service.name}: invalid exec syntax")
        return
    var arguments = split.words
    let progName = arguments[0]
    arguments = arguments[1..^1]
    try:
-        var process = startProcess(progName, workingDir=service.workDir, args=arguments, options={poParentStreams, })
+        var split = shlex(service.exec)
        if split.error:
            logger.error(&"Error while starting service '{service.name}': invalid exec syntax")
            quit(0)
        var arguments = split.words
        let progName = arguments[0]
        arguments = arguments[1..^1]
        process = startProcess(progName, workingDir=service.workDir, args=arguments)
        if service.supervised:
            supervisorWorker(logger, service, process.processID)
-    except OSError:
+        # If the service is unsupervised we just exit
    except:
        logger.error(&"Error while starting service {service.name}: {getCurrentExceptionMsg()}")
    if process != nil:
        process.close()
    quit(0)
-proc startServices*(logger: Logger, workers: int = 1, level: RunLevel) =
+proc startServices*(logger: Logger, level: RunLevel, workers: int = 1) =
-    ## Starts the services in the given
+    ## Starts the registered services in the 
-    ## runlevel. The workers parameter
+    ## given runlevel
-    ## configures parallelism and allows
+    if workers > cpuinfo.countProcessors() - 1:
-    ## for faster boot times by starting
+        logger.warning(&"The configured number of workers ({workers}) is greater than the recommended one ({cpuinfo.countProcessors() - 1}), performance may degrade")
-    ## services concurrently rather than 
+    var workerCount: int = 0
-    ## sequentially (1 to disable parallelism).
+    var status: cint
    ## Note this function immediately returns to
    ## the caller and forks in the background
    echo posix.getpid()
    discard readLine(stdin)
    var pid: int = posix.fork()
    if pid == -1:
-        logger.fatal(&"Could not fork(): {posix.strerror(posix.errno)}")
+        logger.error(&"Error, cannot fork: {posix.strerror(posix.errno)}")
        return
    elif pid == 0:
-        quit(0)
+        logger.debug("Started service spawner process")
-    var servicesCopy: seq[Service] = @[]
+        var servicesCopy: seq[Service] = @[]
-    echo servicesCopy.len(), " ", posix.getpid()
+        for service in services:
-    for service in services:
+            if service.runlevel == level:
-        if service.runlevel == level:
+                servicesCopy.add(service)
-            servicesCopy.add(service)
+        while servicesCopy.len() > 0:
-    echo servicesCopy.len(), " ", posix.getpid()
+            if workerCount == workers:
-    if workers > cpuinfo.countProcessors() * 2 - 1:
+                discard waitPid(cint(pid), status, WUNTRACED)
-        logger.warning(&"The configured workers count is beyond the recommended threshold ({workers} > {cpuinfo.countProcessors() * 2 - 1}), performance may degrade")
+                dec(workerCount)
    while servicesCopy.len() > 0:
        echo servicesCopy.len(), " ", posix.getpid()
        sleepSeconds(5)
        for i in 1..workers:
            pid = posix.fork()
            if pid == -1:
                logger.error(&"An error occurred while forking to spawn services, trying again: {posix.strerror(posix.errno)}")
            elif pid == 0:
-                logger.info(&"Starting service {servicesCopy[0].name}")
+                logger.trace(&"New child has been spawned")
                if not servicesCopy[0].supervised:
                    logger.info(&"Starting unsupervised service '{servicesCopy[0].name}'")
                else:
                    logger.info(&"Starting supervised service '{servicesCopy[0].name}'")
                startService(logger, servicesCopy[0])
-            else:
+            elif servicesCopy.len() > 0:
-                if servicesCopy.len() > 0 and servicesCopy[0].supervised:
+                workerCount += 1
                if servicesCopy[0].supervised:
                    addManagedProcess(pid, servicesCopy[0])
-            
+                servicesCopy.del(0)
        quit(0)
    else:
        discard waitPid(cint(pid), status, WUNTRACED)
--- a/src/core/shutdown.nim
+++ b/src/core/shutdown.nim
@ -79,6 +79,15 @@ proc nimDExit*(logger: Logger, code: int, emerg: bool = true) =
    ## NimD's exit point. This function tries to shut down
    ## as cleanly as possible. When emerg equals true, it will
    ## try to spawn a root shell and exit
    if emerg:
        var status: cint
        # We're in emergency mode: do not crash the kernel, spawn a shell and exit
        logger.fatal("NimD has entered emergency mode and cannot continue. You will be now (hopefully) dropped in a root shell: you're on your own. May the force be with you")
        logger.info("Terminating child processes with SIGKILL")
        discard posix.kill(SIGKILL, -1)
        discard posix.waitPid(-1, status, 0)
        discard execShellCmd("/bin/sh")  # TODO: Is this fine? maybe use execProcess
        quit(-1)
    logger.warning("The system is shutting down")
    logger.info("Processing shutdown runlevel")
    # TODO
@ -89,22 +98,15 @@ proc nimDExit*(logger: Logger, code: int, emerg: bool = true) =
    except:
        logger.error(&"An error has occurred while calling shutdown handlers. Error -> {getCurrentExceptionMsg()}")
        # Note: continues calling handlers!
-    if emerg:
+    logger.info("Terminating child processes with SIGTERM")
-        # We're in emergency mode: do not crash the kernel, spawn a shell and exit
+    logger.debug(&"Waiting up to {sigTermDelay} seconds for the kernel to deliver signals")
-        logger.fatal("NimD has entered emergency mode and cannot continue. You will be now (hopefully) dropped in a root shell: you're on your own. May the force be with you")
+    discard posix.kill(SIGTERM, -1)  # The kernel handles this for us asynchronously
-        discard execShellCmd("/bin/sh")  # TODO: Is this fine? maybe use execProcess
+    var t = cpuTime()
-    else:
+    # We wait some time for the signals to propagate
-        logger.info("Terminating child processes with SIGTERM")
+    while anyUserlandProcessLeft() or cpuTime() - t >= sigTermDelay:
-        logger.debug(&"Waiting {sigTermDelay} seconds for the kernel to deliver signals")
+        sleepSeconds(0.25)
-        discard posix.kill(SIGTERM, -1)  # The kernel handles this for us asynchronously
+    if anyUserlandProcessLeft():
-        var t = cpuTime()
+        logger.info("Terminating child processes with SIGKILL")
-        # We wait some time for the signals to propagate
+        discard posix.kill(SIGKILL, -1)
-        while anyUserlandProcessLeft():
+    logger.warning("Shutdown procedure complete, sending final termination signal")
-            sleepSeconds(0.25)
+    quit(code)
            if cpuTime() - t >= sigTermDelay:
                break
        if anyUserlandProcessLeft():
            logger.info("Terminating child processes with SIGKILL")
            discard posix.kill(SIGKILL, -1)
        logger.warning("Shutdown procedure complete, sending final termination signal")
    quit(code)
--- a/src/main.nim
+++ b/src/main.nim
@ -39,8 +39,8 @@ proc main(logger: Logger, mountDisks: bool = true, fstab: string = "/etc/fstab")
        logger.fatal(&"NimD must run as root, but current user id is {uid}")
        nimDExit(logger, EPERM)   # EPERM - Operation not permitted
    logger.trace("Setting up signal handlers")
-    onSignal(SIGABRT, SIGALRM, SIGHUP, SIGILL, SIGKILL, SIGQUIT, SIGSTOP, SIGTSTP,
+    onSignal(SIGABRT, SIGALRM, SIGHUP, SIGILL, SIGKILL, SIGQUIT, SIGSTOP, SIGSEGV, SIGTSTP,
-            SIGTRAP, SIGTERM, SIGPIPE, SIGUSR1, SIGUSR2, 6, SIGFPE, SIGBUS, SIGURG, SIGINT):  # 6 is SIGIOT
+            SIGTRAP, SIGPIPE, SIGUSR1, SIGUSR2, 6, SIGFPE, SIGBUS, SIGURG, SIGTERM, SIGINT):  # 6 is SIGIOT
        # Can't capture local variables because this implicitly generates
        # a noconv procedure, so we use getDefaultLogger() instead. Must find
        # a better solution long-term because we need the configuration from
@ -51,6 +51,9 @@ proc main(logger: Logger, mountDisks: bool = true, fstab: string = "/etc/fstab")
        # One of the key features of an init system is reaping child
        # processes!
        reapProcess(getDefaultLogger())
    onSignal(SIGINT):
        # Temporary
        nimDExit(getDefaultLogger(), 131, emerg=true)
    addSymlink(newSymlink(dest="/dev/fd", source="/proc/self/fd"))
    addSymlink(newSymlink(dest="/dev/fd/0", source="/proc/self/fd/0"))
    addSymlink(newSymlink(dest="/dev/fd/1", source="/proc/self/fd/1"))
@ -90,12 +93,38 @@ proc main(logger: Logger, mountDisks: bool = true, fstab: string = "/etc/fstab")
    logger.debug(&"Hostname was set to '{setHostname(logger)}'")
    logger.info("Creating symlinks")
    createSymlinks(logger)
    logger.info("Creating directories")
    createDirectories(logger)
    logger.debug("Entering critical fork() section: blocking signals")
    var sigset: Sigset
    # TODO
    logger.info("Processing boot runlevel")
-    addService(newService(name="Test", description="prints owo", exec="/bin/echo owo",
+    addService(newService(name="echoer", description="prints owo", exec="/bin/echo owo",
                          runlevel=Boot, kind=Oneshot, workDir=getCurrentDir(),
                          supervised=false, restartOnFailure=false, restartDelay=0))
-    startServices(logger, workers=1, level=Boot)
+    addService(newService(name="sleeper", description="la mamma di licenziato", 
                         exec="/usr/bin/sleep 10", supervised=true, restartOnFailure=true,
                         restartDelay=5, runlevel=Boot, workDir="/home", kind=Simple))
    addService(newService(name="errorer", description="la mamma di gavd", 
                         exec="/bin/false", supervised=true, restartOnFailure=true,
                         restartDelay=5, runlevel=Boot, workDir="/", kind=Simple))
    startServices(logger, workers=2, level=Boot)
    logger.debug("Setting up real signal handlers")
    onSignal(SIGABRT, SIGALRM, SIGHUP, SIGILL, SIGKILL, SIGQUIT, SIGSTOP, SIGSEGV, SIGTSTP,
            SIGTRAP, SIGPIPE, SIGUSR1, SIGUSR2, 6, SIGFPE, SIGBUS, SIGURG, SIGTERM):  # 6 is SIGIOT
        # Can't capture local variables because this implicitly generates
        # a noconv procedure, so we use getDefaultLogger() instead. Must find
        # a better solution long-term because we need the configuration from
        # our own logger object (otherwise we'd always create a new one and
        # never switch our logs to file once booting is completed)
        getDefaultLogger().warning(&"Ignoring signal {sig} ({strsignal(sig)})")  # Nim injects the variable "sig" into the scope. Gotta love those macros
    onSignal(SIGCHLD):
        # One of the key features of an init system is reaping child
        # processes!
        reapProcess(getDefaultLogger())
    onSignal(SIGINT):
        # Temporary
        nimDExit(getDefaultLogger(), 131, emerg=true)
    logger.debug("Starting main loop")
    mainLoop(logger)
@ -144,7 +173,6 @@ when isMainModule:
    try:
        main(logger)
    except:
        raise
        logger.fatal(&"A fatal unrecoverable error has occurred during startup and NimD cannot continue: {getCurrentExceptionMsg()}")
        nimDExit(logger, 131)  # ENOTRECOVERABLE - State not recoverable
        # This will almost certainly cause the kernel to crash with an error the likes of "Kernel not syncing, attempted to kill init!",
--- a/src/util/logging.nim
+++ b/src/util/logging.nim
@ -15,6 +15,7 @@
 # A simple logging module inspired by python's own logging facility
 import terminal
 import strformat
 import posix
 import times
@ -73,47 +74,55 @@ proc getDefaultLogger*(): Logger =
    ## level that writes the given message to the
    ## standard error with some basic info like the
    ## current date and time and the log level
-    
+        
    setStdIoUnbuffered()   # Just in case
    proc logTrace(self: LogHandler, logger: Logger, message: string) =
        setForegroundColor(fgMagenta)
-        stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - TRACE] {message}""")
+        stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - TRACE ({posix.getpid()})] {message}""")
        stderr.flushFile()
        setForegroundColor(fgDefault)
    proc logDebug(self: LogHandler, logger: Logger, message: string) =
        setForegroundColor(fgCyan)
-        stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - DEBUG] {message}""")
+        stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - DEBUG ({posix.getpid()})] {message}""")
        stderr.flushFile()
        setForegroundColor(fgDefault)
    proc logInfo(self: LogHandler, logger: Logger, message: string) =
        setForegroundColor(fgGreen)
-        stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - INFO] {message}""")
+        stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - INFO ({posix.getpid()})] {message}""")
        stderr.flushFile()
        setForegroundColor(fgDefault)
    proc logWarning(self: LogHandler, logger: Logger, message: string) =
        setForegroundColor(fgYellow)
-        stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - WARNING] {message}""")
+        stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - WARNING ({posix.getpid()})] {message}""")
        stderr.flushFile()
        setForegroundColor(fgDefault)
    proc logError(self: LogHandler, logger: Logger, message: string) =
        setForegroundColor(fgRed)
-        stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - ERROR] {message}""")
+        stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - ERROR ({posix.getpid()})] {message}""")
        stderr.flushFile()
        setForegroundColor(fgDefault)
    proc logCritical(self: LogHandler, logger: Logger, message: string) =
        setForegroundColor(fgRed)
-        stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - CRITICAL] {message}""")
+        stderr.writeLine(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - CRITICAL ({posix.getpid()})] {message}""")
        stderr.flushFile()
        setForegroundColor(fgDefault)
    proc logFatal(self: LogHandler, logger: Logger, message: string) =
        setForegroundColor(fgBlack)
        setBackgroundColor(bgRed)
-        stderr.write(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - FATAL]""")
+        stderr.write(&"""[{fromUnix(getTime().toUnixFloat().int).format("d/M/yyyy HH:mm:ss")} - FATAL ({posix.getpid()})]""")
        setForegroundColor(fgRed)
        setBackgroundColor(bgDefault)
        stderr.writeline(&" {message}")
        setForegroundColor(fgDefault)
        stderr.flushFile()
    result = newLogger()
    result.addHandler(createHandler(logTrace, LogLevel.Trace))