diff --git a/docs/bytecode.md b/docs/bytecode.md index 8c07ac3..974b5f9 100644 --- a/docs/bytecode.md +++ b/docs/bytecode.md @@ -1,7 +1,8 @@ # Peon - Bytecode Specification This document aims to document peon's bytecode as well as how it is (de-)serialized to/from files and -other file-like objects. +other file-like objects. Note that the segments in a bytecode dump appear in the order they are listed +in this document. ## Code Structure @@ -9,12 +10,12 @@ A peon program is compiled into a tightly packed sequence of bytes that contain the VM needs to execute said program. There is no dependence between the frontend and the backend outside of the bytecode format (which is implemented in a separate serialiazer module) to allow for maximum modularity. -A peon bytecode dump contains: +A peon bytecode file contains the following: - Constants -- The bytecode itself -- Debugging information -- File and version metadata +- The program's code +- Debugging information (file and version metadata, module info. Optional) + ## File Headers @@ -34,7 +35,7 @@ in release builds. ### Line data segment The line data segment contains information about each instruction in the code segment and associates them -1:1 with a line number in the original source file for easier debugging using run-length encoding. The section's +1:1 with a line number in the original source file for easier debugging using run-length encoding. The segment's size is fixed and is encoded at the beginning as a sequence of 4 bytes (i.e. a single 32 bit integer). The data in this segment can be decoded as explained in [this file](../src/frontend/compiler/targgets/bytecode/opcodes.nim#L29), which is quoted below: @@ -57,7 +58,7 @@ below: This segment contains details about each function in the original file. The segment's size is fixed and is encoded at the beginning as a sequence of 4 bytes (i.e. a single 32 bit integer). 
The data in this segment can be decoded as explained -in [this file](../src/frontend/compiler/targgets/bytecode/opcodes.nim#L39), which is quoted below: +in [this file](../src/frontend/compiler/targets/bytecode/opcodes.nim#L39), which is quoted below: ``` [...] @@ -74,6 +75,26 @@ in [this file](../src/frontend/compiler/targgets/bytecode/opcodes.nim#L39), whic [...] ``` +### Modules segment + +This segment contains details about the modules that make up the original source code which produced a given bytecode dump. +The data in this segment can be decoded as explained in [this file](../src/frontend/compiler/targets/bytecode/opcodes.nim#L49), which is quoted below: +``` +[...] +## modules contains information about all the peon modules that the compiler has encountered, +## along with their start/end offset in the code. Unlike other bytecode-compiled languages like +## Python, peon does not produce a bytecode file for each separate module it compiles: everything +## is contained within a single binary blob. While this simplifies the implementation and makes +## bytecode files entirely "self-hosted", it also means that the original module information is +## lost: this segment serves to fix that. The segment's size is encoded at the beginning as a 4-byte +## sequence (i.e. a single 32-bit integer) and its encoding is similar to that of the functions segment: +## - First, the position into the bytecode where the module begins is encoded (as a 3 byte integer) +## - Second, the position into the bytecode where the module ends is encoded (as a 3 byte integer) +## - Lastly, the module's name is encoded in ASCII, prepended with its size as a 2-byte integer +[...] +``` + + ## Constant segment The constant segment contains all the read-only values that the code will need at runtime, such as hardcoded @@ -87,6 +108,6 @@ real-world scenarios it likely won't be. ## Code segment -The code segment contains the linear sequence of bytecode instructions of a peon program. 
It is to be read directly -and without modifications. The segment's size is fixed and is encoded at the beginning as a sequence of 3 bytes +The code segment contains the linear sequence of bytecode instructions of a peon program to be fed directly to +peon's virtual machine. The segment's size is fixed and is encoded at the beginning as a sequence of 3 bytes (i.e. a single 24 bit integer). All the instructions are documented [here](../src/frontend/compiler/targgets/bytecode/opcodes.nim) \ No newline at end of file diff --git a/src/frontend/compiler/compiler.nim b/src/frontend/compiler/compiler.nim index c80e994..a1be635 100644 --- a/src/frontend/compiler/compiler.nim +++ b/src/frontend/compiler/compiler.nim @@ -134,7 +134,7 @@ type node*: Declaration # Who is this name exported to? (Only makes sense if isPrivate # equals false) - exportedTo*: HashSet[Name] + exportedTo*: HashSet[string] # Has the compiler generated this name internally or # does it come from user code? isReal*: bool @@ -212,7 +212,7 @@ type # The module importing us, if any parentModule*: Name # Currently imported modules - modules*: HashSet[Name] + modules*: HashSet[string] TypedNode* = ref object ## A wapper for AST nodes @@ -354,7 +354,7 @@ proc resolve*(self: Compiler, name: string): Name = # module, so we definitely can't # use it continue - elif self.currentModule in obj.exportedTo: + elif self.currentModule.path in obj.exportedTo: # The name is public in its owner # module and said module has explicitly # exported it to us: we can use it @@ -713,7 +713,7 @@ method findByName*(self: Compiler, name: string): seq[Name] = for obj in reversed(self.names): if obj.ident.token.lexeme == name: if obj.owner.path != self.currentModule.path: - if obj.isPrivate or self.currentModule notin obj.exportedTo: + if obj.isPrivate or self.currentModule.path notin obj.exportedTo: continue result.add(obj) @@ -727,11 +727,13 @@ method findInModule*(self: Compiler, name: string, module: Name): seq[Name] = ## the 
current one or not if name == "": for obj in reversed(self.names): - if not obj.isPrivate and obj.owner == module: + if obj.owner.isNil(): + continue + if not obj.isPrivate and obj.owner.path == module.path: result.add(obj) else: for obj in self.findInModule("", module): - if obj.ident.token.lexeme == name and self.currentModule in obj.exportedTo: + if obj.ident.token.lexeme == name and self.currentModule.path in obj.exportedTo: result.add(obj) @@ -1034,7 +1036,7 @@ proc declare*(self: Compiler, node: ASTNode): Name {.discardable.} = break if name.ident.token.lexeme != declaredName: continue - if name.owner != n.owner and (name.isPrivate or n.owner notin name.exportedTo): + if name.owner != n.owner and (name.isPrivate or n.owner.path notin name.exportedTo): continue if name.kind in [NameKind.Var, NameKind.Module, NameKind.CustomType, NameKind.Enum]: if name.depth < n.depth: diff --git a/src/frontend/compiler/targets/bytecode/opcodes.nim b/src/frontend/compiler/targets/bytecode/opcodes.nim index 26645dd..0213af3 100644 --- a/src/frontend/compiler/targets/bytecode/opcodes.nim +++ b/src/frontend/compiler/targets/bytecode/opcodes.nim @@ -46,10 +46,21 @@ type ## - After that follows the argument count as a 1 byte integer ## - Lastly, the function's name (optional) is encoded in ASCII, prepended with ## its size as a 2-byte integer + ## modules contains information about all the peon modules that the compiler has encountered, + ## along with their start/end offset in the code. Unlike other bytecode-compiled languages like + ## Python, peon does not produce a bytecode file for each separate module it compiles: everything + ## is contained within a single binary blob. While this simplifies the implementation and makes + ## bytecode files entirely "self-hosted", it also means that the original module information is + ## lost: this segment serves to fix that. The segment's size is encoded at the beginning as a 4-byte + ## sequence (i.e. 
a single 32-bit integer) and its encoding is similar to that of the functions segment: + ## - First, the position into the bytecode where the module begins is encoded (as a 3 byte integer) + ## - Second, the position into the bytecode where the module ends is encoded (as a 3 byte integer) + ## - Lastly, the module's name is encoded in ASCII, prepended with its size as a 2-byte integer consts*: seq[uint8] code*: seq[uint8] lines*: seq[int] functions*: seq[uint8] + modules*: seq[uint8] OpCode* {.pure.} = enum ## Enum of Peon's bytecode opcodes diff --git a/src/frontend/compiler/targets/bytecode/target.nim b/src/frontend/compiler/targets/bytecode/target.nim index 3abc092..fe21f23 100644 --- a/src/frontend/compiler/targets/bytecode/target.nim +++ b/src/frontend/compiler/targets/bytecode/target.nim @@ -1006,7 +1006,7 @@ proc terminateProgram(self: BytecodeCompiler, pos: int) = self.emitByte(ReplExit, self.peek().token.line) else: self.emitByte(OpCode.Return, self.peek().token.line) - self.emitByte(0, self.peek().token.line) # Entry point has no return value (TODO: Add easter eggs, cuz why not) + self.emitByte(0, self.peek().token.line) # Entry point has no return value self.patchReturnAddress(pos) @@ -1478,8 +1478,9 @@ method lambdaExpr(self: BytecodeCompiler, node: LambdaExpr, compile: bool = true line: node.token.line, kind: NameKind.Function, belongsTo: function, - isReal: true) - if compile and node notin self.lambdas: + isReal: true, + ) + if compile and node notin self.lambdas and not node.body.isNil(): self.lambdas.add(node) let jmp = self.emitJump(JumpForwards, node.token.line) if BlockStmt(node.body).code.len() == 0: @@ -1687,7 +1688,7 @@ proc importStmt(self: BytecodeCompiler, node: ImportStmt, compile: bool = true) # Importing a module automatically exports # its public names to us for name in self.findInModule("", module): - name.exportedTo.incl(self.currentModule) + name.exportedTo.incl(self.currentModule.path) except IOError: self.error(&"could not import 
'{module.ident.token.lexeme}': {getCurrentExceptionMsg()}") except OSError: @@ -1705,22 +1706,22 @@ proc exportStmt(self: BytecodeCompiler, node: ExportStmt, compile: bool = true) var name = self.resolveOrError(node.name) if name.isPrivate: self.error("cannot export private names") - name.exportedTo.incl(self.parentModule) + name.exportedTo.incl(self.parentModule.path) case name.kind: of NameKind.Module: # We need to export everything # this module defines! for name in self.findInModule("", name): - name.exportedTo.incl(self.parentModule) + name.exportedTo.incl(self.parentModule.path) of NameKind.Function: # Only exporting a single function (or, well # all of its implementations) for name in self.findByName(name.ident.token.lexeme): if name.kind != NameKind.Function: continue - name.exportedTo.incl(self.parentModule) + name.exportedTo.incl(self.parentModule.path) else: - discard + self.error("unsupported export type") proc breakStmt(self: BytecodeCompiler, node: BreakStmt) = @@ -2073,6 +2074,7 @@ proc compile*(self: BytecodeCompiler, ast: seq[Declaration], file: string, lines self.disabledWarnings = disabledWarnings self.showMismatches = showMismatches self.mode = mode + let start = self.chunk.code.len() if not incremental: self.jumps = @[] let pos = self.beginProgram() @@ -2081,8 +2083,6 @@ proc compile*(self: BytecodeCompiler, ast: seq[Declaration], file: string, lines while not self.done(): self.declaration(Declaration(self.step())) self.terminateProgram(pos) - # TODO: REPL is broken, we need a new way to make - # incremental compilation resume from where it stopped! 
result = self.chunk @@ -2100,7 +2100,7 @@ proc compileModule(self: BytecodeCompiler, module: Name) = break elif i == searchPath.high(): self.error(&"""could not import '{path}': module not found""") - if self.modules.contains(module): + if self.modules.contains(module.path): return let source = readFile(path) let current = self.current @@ -2115,11 +2115,19 @@ proc compileModule(self: BytecodeCompiler, module: Name) = self.replMode = false self.parentModule = currentModule self.currentModule = module + let start = self.chunk.code.len() discard self.compile(self.parser.parse(self.lexer.lex(source, path), path, self.lexer.getLines(), self.lexer.getSource(), persist=true), path, self.lexer.getLines(), self.lexer.getSource(), chunk=self.chunk, incremental=true, isMainModule=false, self.disabledWarnings, self.showMismatches, self.mode) + # Record the new module's start and end offsets in the modules segment + self.chunk.modules.extend(start.toTriple()) + self.chunk.modules.extend(self.chunk.code.high().toTriple()) + # I swear to god if someone ever creates a peon module with a name that's + # longer than 2^16 bytes I will hit them with a metal pipe. Mark my words + self.chunk.modules.extend(self.currentModule.ident.token.lexeme.len().toDouble()) + self.chunk.modules.extend(self.currentModule.ident.token.lexeme.toBytes()) module.file = path # No need to save the old scope depth: import statements are # only allowed at the top level! 
@@ -2133,4 +2141,4 @@ proc compileModule(self: BytecodeCompiler, module: Name) = self.replMode = replMode self.lines = lines self.source = src - self.modules.incl(module) + self.modules.incl(module.path) diff --git a/src/frontend/compiler/targets/bytecode/util/debugger.nim b/src/frontend/compiler/targets/bytecode/util/debugger.nim index cb870ce..5d96303 100644 --- a/src/frontend/compiler/targets/bytecode/util/debugger.nim +++ b/src/frontend/compiler/targets/bytecode/util/debugger.nim @@ -22,12 +22,15 @@ import std/terminal type - Function = ref object - start, stop, bottom, argc: int + Function = object + start, stop, argc: int + name: string + Module = object + start, stop: int name: string - started, stopped: bool Debugger* = ref object chunk: Chunk + modules: seq[Module] functions: seq[Function] current: int @@ -66,21 +69,38 @@ proc checkFunctionStart(self: Debugger, n: int) = ## Checks if a function begins at the given ## bytecode offset for i, e in self.functions: - if n == e.start and not (e.started or e.stopped): - e.started = true + # Avoids duplicate output + if n == e.start: styledEcho fgBlue, "\n==== Peon Bytecode Disassembler - Function Start ", fgYellow, &"'{e.name}' ", fgBlue, "(", fgYellow, $i, fgBlue, ") ====" styledEcho fgGreen, "\t- Start offset: ", fgYellow, $e.start styledEcho fgGreen, "\t- End offset: ", fgYellow, $e.stop - styledEcho fgGreen, "\t- Argument count: ", fgYellow, $e.argc + styledEcho fgGreen, "\t- Argument count: ", fgYellow, $e.argc, "\n" proc checkFunctionEnd(self: Debugger, n: int) = ## Checks if a function ends at the given ## bytecode offset for i, e in self.functions: - if n == e.stop and e.started and not e.stopped: - e.stopped = true + if n == e.stop: styledEcho fgBlue, "\n==== Peon Bytecode Disassembler - Function End ", fgYellow, &"'{e.name}' ", fgBlue, "(", fgYellow, $i, fgBlue, ") ====" + + +proc checkModuleStart(self: Debugger, n: int) = + ## Checks if a module begins at the given + ## bytecode offset + for i, m in 
self.modules: + if m.start == n: + styledEcho fgBlue, "\n==== Peon Bytecode Disassembler - Module Start ", fgYellow, &"'{m.name}' ", fgBlue, "(", fgYellow, $i, fgBlue, ") ====" + styledEcho fgGreen, "\t- Start offset: ", fgYellow, $m.start + styledEcho fgGreen, "\t- End offset: ", fgYellow, $m.stop, "\n" + + +proc checkModuleEnd(self: Debugger, n: int) = + ## Checks if a module ends at the given + ## bytecode offset + for i, m in self.modules: + if m.stop == n: + styledEcho fgBlue, "\n==== Peon Bytecode Disassembler - Module End ", fgYellow, &"'{m.name}' ", fgBlue, "(", fgYellow, $i, fgBlue, ") ====" proc simpleInstruction(self: Debugger, instruction: OpCode) = @@ -94,9 +114,6 @@ proc simpleInstruction(self: Debugger, instruction: OpCode) = else: stdout.styledWriteLine(fgYellow, "No") self.current += 1 - self.checkFunctionEnd(self.current - 2) - self.checkFunctionEnd(self.current - 1) - self.checkFunctionEnd(self.current) proc stackTripleInstruction(self: Debugger, instruction: OpCode) = @@ -168,20 +185,27 @@ proc jumpInstruction(self: Debugger, instruction: OpCode) = self.current += 4 while self.chunk.code[self.current] == NoOp.uint8: inc(self.current) - for i in countup(orig, self.current + 1): - self.checkFunctionStart(i) proc disassembleInstruction*(self: Debugger) = ## Takes one bytecode instruction and prints it + let opcode = OpCode(self.chunk.code[self.current]) + self.checkModuleStart(self.current) + self.checkFunctionStart(self.current) printDebug("Offset: ") stdout.styledWriteLine(fgYellow, $(self.current)) printDebug("Line: ") stdout.styledWriteLine(fgYellow, &"{self.chunk.getLine(self.current)}") - var opcode = OpCode(self.chunk.code[self.current]) case opcode: of simpleInstructions: self.simpleInstruction(opcode) + # Functions (and modules) only have a single return statement at the + # end of their body, so we never execute this more than once per module/function + if opcode == Return: + # -2 to skip the hardcoded argument to return + # and the 
increment by simpleInstruction() + self.checkFunctionEnd(self.current - 2) + self.checkModuleEnd(self.current - 1) of constantInstructions: self.constantInstruction(opcode) of stackDoubleInstructions: @@ -197,7 +221,9 @@ proc disassembleInstruction*(self: Debugger) = else: echo &"DEBUG - Unknown opcode {opcode} at index {self.current}" self.current += 1 - + + + proc parseFunctions(self: Debugger) = ## Parses function information in the chunk @@ -206,7 +232,7 @@ proc parseFunctions(self: Debugger) = name: string idx = 0 size = 0 - while idx < len(self.chunk.functions) - 1: + while idx < self.chunk.functions.high(): start = int([self.chunk.functions[idx], self.chunk.functions[idx + 1], self.chunk.functions[idx + 2]].fromTriple()) idx += 3 stop = int([self.chunk.functions[idx], self.chunk.functions[idx + 1], self.chunk.functions[idx + 2]].fromTriple()) @@ -220,15 +246,36 @@ proc parseFunctions(self: Debugger) = self.functions.add(Function(start: start, stop: stop, argc: argc, name: name)) +proc parseModules(self: Debugger) = + ## Parses module information in the chunk + var + start, stop: int + name: string + idx = 0 + size = 0 + while idx < self.chunk.modules.high(): + start = int([self.chunk.modules[idx], self.chunk.modules[idx + 1], self.chunk.modules[idx + 2]].fromTriple()) + idx += 3 + stop = int([self.chunk.modules[idx], self.chunk.modules[idx + 1], self.chunk.modules[idx + 2]].fromTriple()) + idx += 3 + size = int([self.chunk.modules[idx], self.chunk.modules[idx + 1]].fromDouble()) + idx += 2 + name = self.chunk.modules[idx..