utf8 support

This commit is contained in:
prod2 2022-02-08 10:10:07 +01:00
parent ece31b11e0
commit a16026b364
3 changed files with 86 additions and 25 deletions

View File

@ -1,6 +1,7 @@
import strutils
import tables
import strformat
import unicode
type
Scanner* = ref object
@ -33,23 +34,28 @@ proc debugPrint*(token: Token) =
proc isAtEnd(scanner: Scanner): bool =
  ## True once the byte cursor `current` has moved past the last valid index
  ## of `source`, i.e. there is nothing left to scan.
  result = scanner.current >= scanner.source.len
proc advance(scanner: Scanner): Rune =
  ## Decodes the rune that starts at byte offset `current` in `source`,
  ## returns it, and moves `current` forward past that rune (doInc is on).
  fastRuneAt(scanner.source, scanner.current, result, true)
proc peek(scanner: Scanner): Rune =
  ## Returns the rune at the cursor without consuming it.
  ## When the scanner is at the end of the source the NUL rune is returned.
  if scanner.isAtEnd():
    return Rune(0)
  # doInc is off, so `current` is left untouched.
  fastRuneAt(scanner.source, scanner.current, result, false)
proc peekNext(scanner: Scanner): Rune =
  ## Returns the rune that follows the rune at the cursor, without consuming
  ## anything. Returns the NUL rune when there is no such rune.
  ##
  ## `current` is a byte offset, so the next rune starts at `current` plus the
  ## byte length of the current rune. Using `current + 1` is only right when
  ## the current rune is a single byte and would otherwise decode from the
  ## middle of a multi-byte sequence.
  if scanner.current >= scanner.source.high:
    # Cursor is on the last byte or beyond: nothing can follow.
    return Rune(0)
  let nextStart = scanner.current + scanner.source.runeLenAt(scanner.current)
  if nextStart > scanner.source.high:
    # The current rune is the last one in the source.
    return Rune(0)
  scanner.source.fastRuneAt(nextStart, result, doInc = false)
proc match(scanner: Scanner, exp: char): bool =
template `==`(l: char, r: Rune): bool =
  ## Compares a char with a Rune by decoding the char's one-character string
  ## form into a Rune first.
  runeAt($l, 0) == r

template `==`(l: Rune, r: char): bool =
  ## Mirror of the overload above, with the Rune on the left-hand side.
  l == runeAt($r, 0)
proc match(scanner: Scanner, exp: char | Rune): bool =
if scanner.peek() == exp:
discard scanner.advance()
true
@ -69,9 +75,18 @@ proc errorToken(scanner: Scanner, msg: string): Token =
result.text = msg
result.line = scanner.line
proc toChar(r: Rune): char =
  ## Narrows a rune to a char. Use only for matching runes in case
  ## statements: any rune whose UTF-8 encoding is longer than one byte maps
  ## to char(255), which is meant never to match a branch.
  if r.size() == 1:
    char(r.int32) # single-byte rune: the byte is the code point itself
  else:
    char(255) # never match this
proc skipWhitespace(scanner: Scanner) =
while true:
let c = scanner.peek()
let c = scanner.peek().toChar()
case c:
of {' ', '\r', '\t'}:
discard scanner.advance()
@ -80,15 +95,29 @@ proc skipWhitespace(scanner: Scanner) =
discard scanner.advance()
of '/':
if scanner.peekNext() == '/':
while not scanner.isAtEnd() and scanner.peek != '\n' :
while not scanner.isAtEnd() and scanner.peek().toChar() != '\n' :
discard scanner.advance()
elif scanner.peekNext() == '*':
var depth = 1
while not scanner.isAtEnd():
discard scanner.advance()
if scanner.peek().toChar() == '/' and scanner.peekNext().toChar() == '*':
depth.inc
discard scanner.advance()
discard scanner.advance()
if scanner.peek().toChar() == '*' and scanner.peekNext().toChar() == '/':
depth.dec
discard scanner.advance()
discard scanner.advance()
if depth == 0:
break
else:
return
else:
return
proc scanString(scanner: Scanner): Token =
while not scanner.isAtEnd() and scanner.peek() != '\"' :
while not scanner.isAtEnd() and scanner.peek().toChar() != '\"' :
if scanner.peek() == '\n':
scanner.line.inc
discard scanner.advance()
@ -100,12 +129,12 @@ proc scanString(scanner: Scanner): Token =
scanner.makeToken(tkString)
proc scanNumber(scanner: Scanner): Token =
  ## Consumes a run of digits, optionally followed by a '.' and a further run
  ## of digits, then produces a tkNumber token.
  template skipDigits() =
    while toChar(peek(scanner)) in Digits:
      discard advance(scanner)

  skipDigits()

  # A '.' only belongs to the number when a digit comes right after it.
  if toChar(peek(scanner)) == '.' and toChar(peekNext(scanner)) in Digits:
    discard advance(scanner)
    skipDigits()

  result = scanner.makeToken(tkNumber)
@ -127,14 +156,14 @@ const keywords = {
"while": tkWhile,
}.toTable
proc canStartIdent(chr: Rune): bool =
  ## True when `chr` is an underscore or a rune that `isAlpha` accepts.
  chr == Rune(ord('_')) or chr.isAlpha()
proc canContIdent(chr: Rune): bool =
  ## True when `chr` may start an identifier (see canStartIdent) or is an
  ## ASCII digit.
  if canStartIdent(chr):
    true
  else:
    toChar(chr) in Digits
proc scanIdentifier(scanner: Scanner): Token =
while scanner.peek.canContIdent:
while scanner.peek().canContIdent():
discard scanner.advance()
let text = scanner.source[scanner.start..scanner.current-1]
@ -144,8 +173,8 @@ proc scanIdentifier(scanner: Scanner): Token =
return scanner.makeToken(tkIdentifier)
proc canContLabel(chr: Rune): bool =
  ## True when `chr` is an underscore or a rune that `isAlpha` accepts.
  ## Digits are not accepted.
  chr == Rune(ord('_')) or chr.isAlpha()
proc scanLabel(scanner: Scanner): Token =
if not scanner.peek.canContLabel:
@ -164,7 +193,8 @@ proc scanToken*(scanner: Scanner): Token =
if scanner.isAtEnd():
return scanner.makeToken(tkEof)
let c = scanner.advance()
let rune = scanner.advance()
let c = rune.toChar()
case c:
of '(': return scanner.makeToken(tkLeftParen)
@ -207,7 +237,7 @@ proc scanToken*(scanner: Scanner): Token =
elif scanner.peek().canContIdent(): return scanner.scanIdentifier()
else: return scanner.makeToken(tkColon)
else:
if c.canStartIdent():
if rune.canStartIdent():
# ':' can start ident, but is not handled here
return scanner.scanIdentifier()
else:

30
tests/scanner.nds Normal file
View File

@ -0,0 +1,30 @@
// nice comments
/*
a multiline comment
/*
with nested multiline comments
/*/
this doesn't break it
*/
*/
*/
// some utf8 letters in idents
var áéíóú = 5;
print (áéíóú);
//expect:5.0
{ @å
print ("before");
{
:å = "result";
break @å;
// this convolution needed because breaks detect code after them and error
};
print ("after");
} :: print;
//expect:before
//expect:result

View File

@ -1,5 +1,6 @@
// testing syntactic sugars
// :: piping function call
var double = funct(num) :result = num * 2;