From a16026b364fb4e67bd794a11ad57e9b5e31ba1e6 Mon Sep 17 00:00:00 2001
From: prod2 <95874442+prod2@users.noreply.github.com>
Date: Tue, 8 Feb 2022 10:10:07 +0100
Subject: [PATCH] utf8 support

---
Review notes (ignored by git am):
 * Block comments now count the newlines they skip, so token line numbers
   stay correct after a multi-line comment.
 * peekNext steps over the whole current rune instead of assuming that it
   is one byte wide.
 * The opening "/*" is consumed before the nesting loop, so "/*/" always
   acts as an opener, as it already did when nested.
 * Hunk headers and the diffstat were recomputed for these changes; the
   commit id and blob ids above and below are those of the original patch.

 src/ndspkg/scanner.nim | 95 +++++++++++++++++++++++++++++++-----------
 tests/scanner.nds      | 30 ++++++++++++++
 tests/sugar.nds        |  1 +
 3 files changed, 100 insertions(+), 26 deletions(-)
 create mode 100644 tests/scanner.nds

diff --git a/src/ndspkg/scanner.nim b/src/ndspkg/scanner.nim
index 537ffd2..3bac4d9 100644
--- a/src/ndspkg/scanner.nim
+++ b/src/ndspkg/scanner.nim
@@ -1,6 +1,7 @@
 import strutils
 import tables
 import strformat
+import unicode
 
 type
   Scanner* = ref object
@@ -33,23 +34,34 @@ proc debugPrint*(token: Token) =
 proc isAtEnd(scanner: Scanner): bool =
   scanner.current > scanner.source.high
 
-proc advance(scanner: Scanner): char =
-  scanner.current.inc
-  scanner.source[scanner.current - 1]
+proc advance(scanner: Scanner): Rune =
+  scanner.source.fastRuneAt(scanner.current, result, doInc = true)
 
-proc peek(scanner: Scanner): char =
+proc peek(scanner: Scanner): Rune =
   if scanner.isAtEnd():
-    '\0'
+    return "\0".runeAt(0)
   else:
-    scanner.source[scanner.current]
+    scanner.source.fastRuneAt(scanner.current, result, doInc = false)
 
-proc peekNext(scanner: Scanner): char =
-  if scanner.current < scanner.source.high:
-    scanner.source[scanner.current + 1]
+proc peekNext(scanner: Scanner): Rune =
+  ## Rune after the one under the cursor, or NUL when there is none.
+  if scanner.isAtEnd():
+    return "\0".runeAt(0)
+  # Skip the whole current rune: it may be several bytes wide, so
+  # `current + 1` could point into the middle of a UTF-8 sequence.
+  let next = scanner.current + scanner.source.runeLenAt(scanner.current)
+  if next <= scanner.source.high:
+    scanner.source.fastRuneAt(next, result, doInc = false)
   else:
-    '\0'
+    return "\0".runeAt(0)
 
-proc match(scanner: Scanner, exp: char): bool =
+template `==`(l: char, r: Rune): bool =
+  ($l).runeAt(0) == r
+
+template `==`(l: Rune, r: char): bool =
+  ($r).runeAt(0) == l
+
+proc match(scanner: Scanner, exp: char | Rune): bool =
   if scanner.peek() == exp:
     discard scanner.advance()
     true
@@ -69,9 +81,18 @@ proc errorToken(scanner: Scanner, msg: string): Token =
   result.text = msg
   result.line = scanner.line
 
+
+
+proc toChar(r: Rune): char =
+  ## use only for matching runes in case statements
+  if r.size() > 1:
+    char(255) # never match this
+  else:
+    ($r)[0]
+
 proc skipWhitespace(scanner: Scanner) =
   while true:
-    let c = scanner.peek()
+    let c = scanner.peek().toChar()
     case c:
       of {' ', '\r', '\t'}:
         discard scanner.advance()
@@ -80,15 +101,36 @@ proc skipWhitespace(scanner: Scanner) =
         discard scanner.advance()
       of '/':
         if scanner.peekNext() == '/':
-          while not scanner.isAtEnd() and scanner.peek != '\n' :
+          while not scanner.isAtEnd() and scanner.peek().toChar() != '\n' :
             discard scanner.advance()
+        elif scanner.peekNext() == '*':
+          # Nestable block comment. Consume the opening "/*" up front so
+          # that "/*/" is not mistaken for an opener followed by a closer.
+          discard scanner.advance()
+          discard scanner.advance()
+          var depth = 1
+          # An unterminated comment simply runs to the end of the source.
+          while depth > 0 and not scanner.isAtEnd():
+            if scanner.peek() == '/' and scanner.peekNext() == '*':
+              depth.inc
+              discard scanner.advance()
+              discard scanner.advance()
+            elif scanner.peek() == '*' and scanner.peekNext() == '/':
+              depth.dec
+              discard scanner.advance()
+              discard scanner.advance()
+            else:
+              # keep line numbers right for tokens after the comment
+              if scanner.peek() == '\n':
+                scanner.line.inc
+              discard scanner.advance()
         else:
           return
       else:
         return
 
 proc scanString(scanner: Scanner): Token =
-  while not scanner.isAtEnd() and scanner.peek() != '\"' :
+  while not scanner.isAtEnd() and scanner.peek().toChar() != '\"' :
     if scanner.peek() == '\n':
       scanner.line.inc
     discard scanner.advance()
@@ -100,12 +142,12 @@ proc scanString(scanner: Scanner): Token =
   scanner.makeToken(tkString)
 
 proc scanNumber(scanner: Scanner): Token =
-  while scanner.peek() in Digits:
+  while scanner.peek().toChar() in Digits:
     discard scanner.advance()
 
-  if scanner.peek() == '.' and scanner.peekNext() in Digits:
+  if scanner.peek().toChar() == '.' and scanner.peekNext().toChar() in Digits:
     discard scanner.advance()
-    while scanner.peek() in Digits:
+    while scanner.peek().toChar() in Digits:
       discard scanner.advance()
 
   return scanner.makeToken(tkNumber)
@@ -127,14 +169,14 @@ const keywords = {
   "while": tkWhile,
 }.toTable
 
-proc canStartIdent(chr: char): bool =
-  chr in Letters or chr in {'_'}
+proc canStartIdent(chr: Rune): bool =
+  chr.isAlpha() or chr.toChar() == '_'
 
-proc canContIdent(chr: char): bool =
-  canStartIdent(chr) or chr in Digits
+proc canContIdent(chr: Rune): bool =
+  canStartIdent(chr) or chr.toChar() in Digits
 
 proc scanIdentifier(scanner: Scanner): Token =
-  while scanner.peek.canContIdent:
+  while scanner.peek().canContIdent():
     discard scanner.advance()
 
   let text = scanner.source[scanner.start..scanner.current-1]
@@ -144,8 +186,8 @@ proc scanIdentifier(scanner: Scanner): Token =
 
   return scanner.makeToken(tkIdentifier)
 
-proc canContLabel(chr: char): bool =
-  chr in Letters or chr == '_'
+proc canContLabel(chr: Rune): bool =
+  chr.isAlpha() or chr.toChar() == '_'
 
 proc scanLabel(scanner: Scanner): Token =
   if not scanner.peek.canContLabel:
@@ -164,7 +206,8 @@ proc scanToken*(scanner: Scanner): Token =
   if scanner.isAtEnd():
     return scanner.makeToken(tkEof)
 
-  let c = scanner.advance()
+  let rune = scanner.advance()
+  let c = rune.toChar()
 
   case c:
     of '(': return scanner.makeToken(tkLeftParen)
@@ -207,7 +250,7 @@ proc scanToken*(scanner: Scanner): Token =
       elif scanner.peek().canContIdent(): return scanner.scanIdentifier()
       else: return scanner.makeToken(tkColon)
     else:
-      if c.canStartIdent():
+      if rune.canStartIdent():
         # ':' can start ident, but is not handled here
         return scanner.scanIdentifier()
       else:
diff --git a/tests/scanner.nds b/tests/scanner.nds
new file mode 100644
index 0000000..76e34d8
--- /dev/null
+++ b/tests/scanner.nds
@@ -0,0 +1,30 @@
+
+// nice comments
+/*
+    a multiline comment
+    /*
+        with nested multiline comments
+        /*/
+            this doesn't break it
+        */
+    */
+*/
+
+// some utf8 letters in idents
+
+var áéíóú = 5;
+print (áéíóú);
+//expect:5.0
+
+{ @å
+    print ("before");
+    {
+        :å = "result";
+        break @å;
+        // this convolution needed because breaks detect code after them and error
+    };
+    print ("after");
+} :: print;
+
+//expect:before
+//expect:result
\ No newline at end of file
diff --git a/tests/sugar.nds b/tests/sugar.nds
index 7b3603d..71c14dc 100644
--- a/tests/sugar.nds
+++ b/tests/sugar.nds
@@ -1,5 +1,6 @@
 // testing syntactic sugars
 
+
 // :: piping function call
 var double = funct(num) :result = num * 2;
 