From 1a0587d08b9c68c5b3b578b64d95a8fb644ff4dc Mon Sep 17 00:00:00 2001 From: Mattia Giambirtone Date: Fri, 20 May 2022 15:47:04 +0200 Subject: [PATCH] Minor style changes, removed findImpl from compiler, made the VM use the multibyte utilities, bytecode chunks now no longer store AST node objects and use a stream of bytes instead, fixed issues with endScope() in the compiler which would not pop properly from self.names, fixed issues with blockStmt in parser, added more multibyte utilities --- src/backend/vm.nim | 15 ++- src/config.nim | 4 +- src/frontend/compiler.nim | 51 +++++----- src/frontend/meta/bytecode.nim | 57 ++--------- src/frontend/parser.nim | 6 +- src/test.nim | 10 +- src/util/debugger.nim | 4 - src/util/multibyte.nim | 27 ++++- src/util/serializer.nim | 175 +++++++++------------------------ 9 files changed, 117 insertions(+), 232 deletions(-) diff --git a/src/backend/vm.nim b/src/backend/vm.nim index 9409808..25d97b4 100644 --- a/src/backend/vm.nim +++ b/src/backend/vm.nim @@ -16,6 +16,7 @@ import types import ../config import ../frontend/meta/bytecode +import ../util/multibyte type @@ -106,8 +107,7 @@ proc readShort(self: PeonVM): uint16 = ## bytecode and returns them ## as an unsigned 16 bit ## integer - var arr: array[2, uint8] = [self.readByte(), self.readByte()] - copyMem(result.addr, unsafeAddr(arr), sizeof(arr)) + return [self.readByte(), self.readByte()].fromDouble() proc readLong(self: PeonVM): uint32 = @@ -117,8 +117,7 @@ proc readLong(self: PeonVM): uint32 = ## integer. Note however that ## the boundary is capped at ## 24 bits instead of 32 - var arr: array[3, uint8] = [self.readByte(), self.readByte(), self.readByte()] - copyMem(result.addr, unsafeAddr(arr), sizeof(arr)) + return uint32([self.readByte(), self.readByte(), self.readByte()].fromTriple()) proc readInt64(self: PeonVM, idx: int): PeonObject = @@ -126,8 +125,8 @@ proc readInt64(self: PeonVM, idx: int): PeonObject = ## chunk's constant table and ## returns a Peon object. 
Assumes ## the constant is an Int64 - var arr = [self.chunk.byteConsts[idx], self.chunk.byteConsts[idx + 1], - self.chunk.byteConsts[idx + 2], self.chunk.byteConsts[idx + 3]] + var arr = [self.chunk.consts[idx], self.chunk.consts[idx + 1], + self.chunk.consts[idx + 2], self.chunk.consts[idx + 3]] result = PeonObject(kind: Int64) copyMem(result.long.addr, arr.addr, sizeof(arr)) @@ -137,8 +136,8 @@ proc readUInt64(self: PeonVM, idx: int): PeonObject = ## chunk's constant table and ## returns a Peon object. Assumes ## the constant is an UInt64 - var arr = [self.chunk.byteConsts[idx], self.chunk.byteConsts[idx + 1], - self.chunk.byteConsts[idx + 2], self.chunk.byteConsts[idx + 3]] + var arr = [self.chunk.consts[idx], self.chunk.consts[idx + 1], + self.chunk.consts[idx + 2], self.chunk.consts[idx + 3]] result = PeonObject(kind: UInt64) copyMem(result.uLong.addr, arr.addr, sizeof(arr)) diff --git a/src/config.nim b/src/config.nim index c1716ef..3a464d1 100644 --- a/src/config.nim +++ b/src/config.nim @@ -30,7 +30,7 @@ const PEON_COMMIT_HASH* = "ed79385e2a93100331697f26a4a90157e60ad27a" when len(PEON_COMMIT_HASH) != 40: {.fatal: "The git commit hash must be exactly 40 characters long".} const PEON_BRANCH* = "master" -when len(PEON_BRANCH) >= 255: +when len(PEON_BRANCH) > 255: {.fatal: "The git branch name's length must be less than or equal to 255 characters".} const DEBUG_TRACE_VM* = false # Traces VM execution const SKIP_STDLIB_INIT* = false # Skips stdlib initialization (can be imported manually) @@ -48,7 +48,7 @@ Basic usage ----------- $ peon Opens an interactive session (REPL) -$ peon file.pe Runs the given Peon source file +$ peon file.pn Runs the given Peon source file Command-line options -------------------- diff --git a/src/frontend/compiler.nim b/src/frontend/compiler.nim index ce1565b..850b0d5 100644 --- a/src/frontend/compiler.nim +++ b/src/frontend/compiler.nim @@ -128,7 +128,7 @@ type # inside an implicit try/finally block # and add this code in the 
finally branch. # This sequence is emptied each time a - # fun declaration is compiled and stores only + # function declaration is compiled and stores only # deferred code for the current function (may # be empty) deferred: seq[uint8] @@ -185,8 +185,7 @@ proc done(self: Compiler): bool = result = self.current > self.ast.high() -proc error(self: Compiler, message: string) {.raises: [CompileError, - ValueError].} = +proc error(self: Compiler, message: string) {.raises: [CompileError, ValueError].} = ## Raises a formatted CompileError exception var tok = self.getCurrentNode().token raise newException(CompileError, &"A fatal error occurred while compiling '{self.file}', module '{self.currentModule}' line {tok.line} at '{tok.lexeme}' -> {message}") @@ -231,10 +230,20 @@ proc emitBytes(self: Compiler, bytarr: openarray[uint8]) = self.emitByte(b) -proc makeConstant(self: Compiler, val: Expression, kind: Type): array[3, uint8] = +proc makeConstant(self: Compiler, val: Expression, typ: Type): array[3, uint8] = ## Adds a constant to the current chunk's constant table ## and returns its index as a 3-byte array of uint8s - result = self.chunk.addConstant(val, kind) + case typ.kind: + of UInt8, Int8: + result = self.chunk.writeConstant([uint8(parseInt(val.token.lexeme))]) + of Int16, UInt16: + result = self.chunk.writeConstant(parseInt(val.token.lexeme).toDouble()) + of Int32, UInt32: + result = self.chunk.writeConstant(parseInt(val.token.lexeme).toQuad()) + of Int64, UInt64: + result = self.chunk.writeConstant(parseInt(val.token.lexeme).toLong()) + else: + discard proc emitConstant(self: Compiler, obj: Expression, kind: Type) = @@ -816,23 +825,9 @@ proc identifier(self: Compiler, node: IdentExpr) = self.emitBytes(self.closedOver.high().toTriple()) -proc findImpl(self: Compiler, node: FunDecl): seq[Name] = - ## Looks for functions matching the given declaration - ## in the code that has been compiled so far. 
- ## Returns a list of each matching name object - for obj in reversed(self.names): - # Scopes are indexed backwards! - case obj.valueType.kind: - of Function: - if self.compareTypes(obj.valueType, self.inferType(node)): - result.add(obj) - else: - continue - - proc findByName(self: Compiler, name: string): seq[Name] = ## Looks for objects that have been already declared - ## with the given name + ## with the given name. Returns all objects that apply for obj in reversed(self.names): if obj.name.token.lexeme == name: result.add(obj) @@ -888,12 +883,14 @@ proc beginScope(self: Compiler) = proc endScope(self: Compiler) = ## Ends the current local scope - if self.scopeDepth < 0: - self.error("cannot call endScope with scopeDepth < 0 (This is an internal error and most likely a bug)") + if self.scopeDepth == 0: + self.error("cannot call endScope with scopeDepth == 0 (This is an internal error and most likely a bug)") + dec(self.scopeDepth) var popped: int = 0 - for ident in reversed(self.names): + for i, ident in reversed(self.names): if ident.depth > self.scopeDepth: inc(popped) + self.names.delete(self.names.len() - i) if not self.enableOptimizations: # All variables with a scope depth larger than the current one # are now out of scope. Begone, you're now homeless! 
@@ -918,9 +915,6 @@ proc endScope(self: Compiler) = elif popped == 1: # We only emit PopN if we're popping more than one value self.emitByte(Pop) - for _ in countup(0, popped - 1): - discard self.names.pop() - dec(self.scopeDepth) proc blockStmt(self: Compiler, node: BlockStmt) = @@ -1273,8 +1267,7 @@ proc compile*(self: Compiler, ast: seq[Declaration], file: string): Chunk = self.declaration(Declaration(self.step())) if self.ast.len() > 0: # *Technically* an empty program is a valid program - self.endScope() self.emitByte(OpCode.Return) # Exits the VM's main loop when used at the global scope result = self.chunk - if self.ast.len() > 0 and self.scopeDepth != -1: - self.error(&"invalid state: invalid scopeDepth value (expected -1, got {self.scopeDepth}), did you forget to call endScope/beginScope?") + if self.ast.len() > 0 and self.scopeDepth != 0: + self.error(&"invalid state: invalid scopeDepth value (expected 0, got {self.scopeDepth}), did you forget to call endScope/beginScope?") diff --git a/src/frontend/meta/bytecode.nim b/src/frontend/meta/bytecode.nim index 2911ab3..a41b15b 100644 --- a/src/frontend/meta/bytecode.nim +++ b/src/frontend/meta/bytecode.nim @@ -13,24 +13,16 @@ # limitations under the License. ## Low level bytecode implementation details -import ast -import errors import strutils import strformat import ../../util/multibyte -import ../compiler - -export ast type Chunk* = ref object ## A piece of bytecode. - ## consts represents the high-level constants table the code is - ## referring to and is only meaningful at compile time (not stored - ## in bytecode dumps!). ## byteConsts is used when serializing to/from a bytecode stream. ## code is the linear sequence of compiled bytecode instructions. ## lines maps bytecode instructions to line numbers using Run @@ -46,8 +38,7 @@ type ## are 3 and 4" ## This is more efficient than using the naive approach, which would encode ## the same line number multiple times and waste considerable amounts of space. 
- consts*: seq[Expression] - byteConsts*: seq[uint8] + consts*: seq[uint8] code*: seq[uint8] lines*: seq[int] reuseConsts*: bool @@ -223,42 +214,10 @@ proc getLine*(self: Chunk, idx: int): int = raise newException(IndexDefect, "index out of range") -proc findOrAddConstant(self: Chunk, constant: Expression, kind: Type): int = - ## Small optimization function that reuses the same constant - ## if it's already been written before (only if self.reuseConsts - ## equals true) - if not self.reuseConsts: - return - for i, c in self.consts: - # We cannot use simple equality because the nodes likely have - # different token objects with different values - if c.kind != constant.kind: - continue - if constant.isConst(): - if LiteralExpr(c).literal.lexeme == LiteralExpr( - constant).literal.lexeme: - # This wouldn't work for stuff like 2e3 and 2000.0, but those - # forms are collapsed in the compiler before being written - # to the constants table - return i - elif constant.kind == identExpr: - if IdentExpr(c).name.lexeme == IdentExpr(constant).name.lexeme: - return i - else: - continue - self.consts.add(constant) - result = self.consts.high() - - -proc addConstant*(self: Chunk, constant: Expression, kind: Type): array[3, uint8] = - ## Writes a constant of the given type in the chunk's constant - ## table. Returns its index as an array of 3 unsigned 8 bit integers. - ## Constant indexes are reused if a constant is used more than once - ## and self.reuseConsts equals true - if self.consts.high() == 16777215: - # The constant index is a 24 bit unsigned integer, so that's as far - # as we can index into the constant table (the same applies - # to our stack by the way). 
Not that anyone's ever gonna hit this - # limit in the real world, but you know, just in case - raise newException(CompileError, "cannot encode more than 16777216 constants") - result = self.findOrAddConstant(constant, kind).toTriple() +proc writeConstant*(self: Chunk, data: openarray[uint8]): array[3, uint8] = + ## Writes a series of bytes to the chunk's constant + ## table and returns the index of the first byte as + ## an array of 3 bytes + result = self.consts.len().toTriple() + for b in data: + self.consts.add(b) diff --git a/src/frontend/parser.nim b/src/frontend/parser.nim index 88aa304..ec64d87 100644 --- a/src/frontend/parser.nim +++ b/src/frontend/parser.nim @@ -547,8 +547,8 @@ proc blockStmt(self: Parser): Statement = var code: seq[Declaration] = @[] while not self.check(RightBrace) and not self.done(): code.add(self.declaration()) - if self.tree[^1] == nil: - self.tree.delete(self.tree.high()) + if code[^1] == nil: + code.delete(code.high()) self.expect(RightBrace, "expecting '}'") result = newBlockStmt(code, tok) self.endScope() @@ -1140,4 +1140,4 @@ proc parse*(self: Parser, tokens: seq[Token], file: string): seq[Declaration] = self.tree.add(self.declaration()) if self.tree[^1] == nil: self.tree.delete(self.tree.high()) - result = self.tree + result = self.tree \ No newline at end of file diff --git a/src/test.nim b/src/test.nim index 8a5449a..6b48e04 100644 --- a/src/test.nim +++ b/src/test.nim @@ -23,11 +23,11 @@ proc fillSymbolTable(tokenizer: Lexer) proc getLineEditor: LineEditor # Handy dandy compile-time constants -const debugLexer = false +const debugLexer = true const debugParser = true const debugCompiler = true -const debugSerializer = false -const debugRuntime = false +const debugSerializer = true +const debugRuntime = true when debugSerializer: import nimSHA2 @@ -113,7 +113,7 @@ when isMainModule: stdout.write(e) if i < len(serialized.chunk.consts) - 1: stdout.write(", ") - stdout.write("]\n") + stdout.write(&"] (matches: 
{serialized.chunk.consts == compiled.consts})\n") stdout.write(&"\t- Reconstructed bytecode: [") for i, e in serialized.chunk.code: stdout.write($e) @@ -175,7 +175,7 @@ proc fillSymbolTable(tokenizer: Lexer) = tokenizer.symbols.addKeyword("case", Case) tokenizer.symbols.addKeyword("operator", Operator) tokenizer.symbols.addKeyword("generator", Generator) - tokenizer.symbols.addKeyword("function", TokenType.Function) + tokenizer.symbols.addKeyword("fn", TokenType.Function) tokenizer.symbols.addKeyword("coroutine", Coroutine) tokenizer.symbols.addKeyword("break", TokenType.Break) tokenizer.symbols.addKeyword("continue", Continue) diff --git a/src/util/debugger.nim b/src/util/debugger.nim index d95953d..3642739 100644 --- a/src/util/debugger.nim +++ b/src/util/debugger.nim @@ -13,7 +13,6 @@ # limitations under the License. import ../frontend/meta/bytecode -import ../frontend/meta/ast import multibyte @@ -104,9 +103,6 @@ proc constantInstruction(instruction: OpCode, chunk: Chunk, offset: int): int = setForegroundColor(fgYellow) stdout.write(&"{obj}\n") setForegroundColor(fgGreen) - printDebug("Value kind: ") - setForegroundColor(fgYellow) - stdout.write(&"{obj.kind}\n") return offset + 4 diff --git a/src/util/multibyte.nim b/src/util/multibyte.nim index cd63a31..4244cf5 100644 --- a/src/util/multibyte.nim +++ b/src/util/multibyte.nim @@ -17,17 +17,26 @@ proc toDouble*(input: int | uint | uint16): array[2, uint8] = - ## Converts an int (either int, uint or uint16) + ## Converts an unsigned integer ## to an array[2, uint8] result = cast[array[2, uint8]](uint16(input)) proc toTriple*(input: uint | int): array[3, uint8] = - ## Converts an unsigned integer (int is converted - ## to an uint and sign is lost!) 
to an array[3, uint8] + ## Converts an unsigned integer to an array[3, uint8] result = cast[array[3, uint8]](uint(input)) +proc toQuad*(input: int | uint | uint16 | uint32): array[4, uint8] = + ## Converts an unsigned integer to an array[4, uint8] + result = cast[array[4, uint8]](uint(input)) + + +proc toLong*(input: int | uint | uint16 | uint32 | uint64): array[8, uint8] = + ## Converts an unsigned integer to an array[8, uint8] + result = cast[array[8, uint8]](uint(input)) + + proc fromDouble*(input: array[2, uint8]): uint16 = ## Rebuilds the output of toDouble into ## an uint16 @@ -38,3 +47,15 @@ proc fromTriple*(input: array[3, uint8]): uint = ## Rebuilds the output of toTriple into ## an uint copyMem(result.addr, unsafeAddr(input), sizeof(uint8) * 3) + + +proc fromQuad*(input: array[4, uint8]): uint = + ## Rebuilds the output of toQuad into + ## an uint + copyMem(result.addr, unsafeAddr(input), sizeof(uint32)) + + +proc fromLong*(input: array[8, uint8]): uint = + ## Rebuilds the output of toLong into + ## an uint + copyMem(result.addr, unsafeAddr(input), sizeof(uint64)) \ No newline at end of file diff --git a/src/util/serializer.nim b/src/util/serializer.nim index f2cb08f..54ca97b 100644 --- a/src/util/serializer.nim +++ b/src/util/serializer.nim @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import ../frontend/meta/ast import ../frontend/meta/errors import ../frontend/meta/bytecode -import ../frontend/meta/token import ../config import multibyte import ../frontend/compiler @@ -85,14 +83,6 @@ proc bytesToString(self: Serializer, input: seq[byte]): string = result.add(char(b)) -proc bytesToInt(self: Serializer, input: array[8, byte]): int = - copyMem(result.addr, input.unsafeAddr, sizeof(int)) - - -proc bytesToInt(self: Serializer, input: array[3, byte]): int = - copyMem(result.addr, input.unsafeAddr, sizeof(byte) * 3) - - proc extend[T](s: var seq[T], a: openarray[T]) = ## Extends s with the elements of a for e in a: @@ -107,122 +97,65 @@ proc writeHeaders(self: Serializer, stream: var seq[byte], file: string) = stream.add(byte(PEON_VERSION.patch)) stream.add(byte(len(PEON_BRANCH))) stream.extend(self.toBytes(PEON_BRANCH)) - if len(PEON_COMMIT_HASH) != 40: - self.error("the commit hash must be exactly 40 characters long") stream.extend(self.toBytes(PEON_COMMIT_HASH)) stream.extend(self.toBytes(getTime().toUnixFloat().int())) stream.extend(self.toBytes(computeSHA256(file))) proc writeConstants(self: Serializer, stream: var seq[byte]) = - ## Writes the constants table in-place into the given stream + ## Writes the constants table in-place into the + ## given stream + stream.extend(self.chunk.consts.len().toQuad()) for constant in self.chunk.consts: - case constant.kind: - of intExpr, floatExpr: - stream.add(0x1) - stream.extend(len(constant.token.lexeme).toTriple()) - stream.extend(self.toBytes(constant.token.lexeme)) - of strExpr: - stream.add(0x2) - var temp: byte - var strip: int = 2 - var offset: int = 1 - case constant.token.lexeme[0]: - of 'f': - strip = 3 - inc(offset) - temp = 0x2 - of 'b': - strip = 3 - inc(offset) - temp = 0x1 - else: - strip = 2 - temp = 0x0 - stream.extend((len(constant.token.lexeme) - strip).toTriple()) # Removes the quotes from the length count as they're not written - stream.add(temp) - 
stream.add(self.toBytes(constant.token.lexeme[offset..^2])) - of identExpr: - stream.add(0x0) - stream.extend(len(constant.token.lexeme).toTriple()) - stream.add(self.toBytes(constant.token.lexeme)) - else: - self.error(&"unknown constant kind in chunk table ({constant.kind})") - stream.add(0x59) # End marker - - -proc readConstants(self: Serializer, stream: seq[byte]): int = - ## Reads the constant table from the given stream and - ## adds each constant to the chunk object. - ## Returns the number of bytes that were processed in - ## the stream - var stream = stream - var count: int = 0 - while true: - case stream[0]: - of 0x59: - inc(count) - break - of 0x2: - stream = stream[1..^1] - let size = self.bytesToInt([stream[0], stream[1], stream[2]]) - stream = stream[3..^1] - var s = newStrExpr(Token(lexeme: "")) - case stream[0]: - of 0x0: - discard - of 0x1: - s.token.lexeme.add("b") - of 0x2: - s.token.lexeme.add("f") - else: - self.error(&"unknown string modifier in chunk table (0x{stream[0].toHex()}") - stream = stream[1..^1] - s.token.lexeme.add("\"") - for i in countup(0, size - 1): - s.token.lexeme.add(cast[char](stream[i])) - s.token.lexeme.add("\"") - stream = stream[size..^1] - self.chunk.consts.add(s) - inc(count, size + 5) - of 0x1: - stream = stream[1..^1] - inc(count) - let size = self.bytesToInt([stream[0], stream[1], stream[2]]) - stream = stream[3..^1] - inc(count, 3) - var tok: Token = new(Token) - tok.lexeme = self.bytesToString(stream[0..