Skip to content

Commit

Permalink
better unicode support
Browse files Browse the repository at this point in the history
Input strings now support the full range of fixed-length escaped unicode chars
and variable-length escaped unicode chars, along with UTF-8 encoding.

Output strings also support the full range of fixed-length escaped unicode chars
for JSON strings.

Full-range unicode support means surrogate pairs are used when necessary.

fixes #50
  • Loading branch information
jangko committed Oct 31, 2021
1 parent a8af6b7 commit e7848a0
Show file tree
Hide file tree
Showing 6 changed files with 918 additions and 431 deletions.
16 changes: 16 additions & 0 deletions docs/toc.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
- [Overview](#overview)
- [Graphql core](#graphql-core)
- [Security features](#security-features)
- [Unicode support](#unicode-support)

- [Tutorial](tutorial.md)
- [Important notes](tutorial.md#important-notes)
Expand Down Expand Up @@ -83,3 +84,18 @@ can bring down the service, both lexer and parser are configurable to mitigate t
- `maxDefinitions`. Queries, mutations, subscriptions, and fragments total number should be reasonable. (default = 512)
- `maxChoices`. Unions and directive's locations are limited by this number. (default = 64)

### Unicode support

- Input string:
- The accepted encoding for input strings is UTF-8.
- Escaped unicode in quoted strings takes the form of UTF-16 BE:
- Fixed 4 digit hex: e.g. `\u000A`
- Variable length: `\u{1F4A9}` with range (>= 0x0000 and <= 0xD7FF, or >= 0xE000 and <= 0x10FFFF)
- Escape sequences are only meaningful within a single-quoted string.
  In multiline (block) strings, unicode chars must be encoded using UTF-8.
- Surrogate pair: `\uD83D\uDCA9` is equal to `\u{1F4A9}`

- Output string:
- Output strings are subject to the output serialization format's specification.
- For example, output using JSON as the serialization format will result in a UTF-8 encoded string.
- If the escape flag is set, it will instead use fixed-length UTF-16 BE 4-digit hex escapes, similar to GraphQL escape sequences.
36 changes: 31 additions & 5 deletions graphql/builtin/json_respstream.nim
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@

import
faststreams/[outputs, textio],
../common/[respstream, ast]
../common/[respstream, ast],
../private/utf

export respstream

Expand All @@ -25,6 +26,7 @@ type
stream: OutputStream
stack: seq[State]
doubleEscape: bool
escapeUnicode: bool

template top(x: seq[State]): State =
x[^1]
Expand Down Expand Up @@ -75,7 +77,7 @@ proc write*(x: JsonRespStream, v: string) =
append '\\'
append c

for c in v:
template writeChar(c: char) =
case c
of '\L': addPrefixSlash 'n'
of '\b': addPrefixSlash 'b'
Expand All @@ -96,6 +98,25 @@ proc write*(x: JsonRespStream, v: string) =
of '\\': addPrefixSlash '\\'
else: append c

if x.escapeUnicode:
for c in Utf8.codePoints(v):
if c >= 0x80:
let p = Utf16.toPair(c)
if p.state == Utf16One:
append "\\u"
x.stream.writeHex([char(p.cp shr 8), char(p.cp and 0xFF)])
elif p.state == Utf16Two:
append "\\u"
x.stream.writeHex([char(p.hi shr 8), char(p.hi and 0xFF)])
append "\\u"
x.stream.writeHex([char(p.lo shr 8), char(p.lo and 0xFF)])
else:
let cc = c.char
writeChar(cc)
else:
for c in v:
writeChar(c)

if x.doubleEscape:
append "\\\""
else:
Expand Down Expand Up @@ -198,12 +219,17 @@ proc getBytes*(x: JsonRespStream): seq[byte] =
proc len*(x: JsonRespStream): int =
x.stream.pos()

proc init*(v: JsonRespStream,
           doubleEscape: bool = false,
           escapeUnicode: bool = false) =
  ## (Re)initialize `v` with an empty in-memory output buffer.
  ## `doubleEscape`: emit strings double-escaped (for embedding in JSON).
  ## `escapeUnicode`: emit non-ASCII chars as `\uXXXX` (UTF-16 BE, surrogate
  ## pairs when needed) instead of raw UTF-8.
  # NOTE(review): the diff view retained the stale pre-change 2-parameter
  # signature on the line above the new one; the merged proc keeps only the
  # post-change 3-parameter form, which is backward compatible via defaults.
  v.stream = memoryOutput()
  v.stack = @[StateTop]
  v.doubleEscape = doubleEscape
  v.escapeUnicode = escapeUnicode

proc new*(_: type JsonRespStream,
          doubleEscape: bool = false,
          escapeUnicode: bool = false): JsonRespStream =
  ## Allocate and initialize a fresh JsonRespStream.
  ## See `init` for the meaning of `doubleEscape` and `escapeUnicode`.
  # NOTE(review): the diff view kept both the stale `v.init(doubleEscape)`
  # call and the new one; only the post-change call forwarding both flags
  # is retained here — otherwise `escapeUnicode` would be silently dropped.
  let v = JsonRespStream()
  v.init(doubleEscape, escapeUnicode)
  v
99 changes: 86 additions & 13 deletions graphql/lexer.nim
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
import
std/[unicode, strutils],
faststreams/inputs,
./common/[names, errors, types]
./common/[names, errors, types],
./private/utf

type
TokKind* = enum
Expand Down Expand Up @@ -48,11 +49,17 @@ type
errInvalidUnicode = "Invalid unicode sequence '$1'"
errInvalidChar = "Invalid char '$1'"
errLoopLimit = "loop limit $1 reached for $2"
errInvalidUTF8 = "Invalid UTF-8 sequence detected in string"
errOrphanSurrogate = "Orphaned surrogate codepoint detected '$1'"

LexerFlag* = enum
lfJsonCompatibility # parse json unicode escape chars but not graphql escape chars

LexConf* = object
maxIdentChars* : int
maxDigits* : int
maxStringChars*: int
flags* : set[LexerFlag]

LexConfInternal = object
maxIdentChars : LoopGuard
Expand All @@ -72,6 +79,7 @@ type
error* : LexerError
err* : ErrorDesc
conf : LexConfInternal
flags* : set[LexerFlag]

proc defaultLexConf*(): LexConf =
result.maxIdentChars = 128
Expand All @@ -88,7 +96,8 @@ proc init*(T: type Lexer, stream: InputStream, names: NameCache, conf = defaultL
stream: stream,
names: names,
line: 1,
conf: toInternalConf(conf)
conf: toInternalConf(conf),
flags: conf.flags
)

template peek(s: InputStream): char =
Expand Down Expand Up @@ -121,7 +130,7 @@ proc lexerError(lex: var Lexer, errKind: LexerError, args: varargs[string, `$`])
lex.err.message = $errKind

case errKind
of errInvalidEscape, errInvalidUnicode, errInvalidChar:
of errInvalidEscape, errInvalidUnicode, errInvalidChar, errOrphanSurrogate:
lex.err.message = $errKind % [args[0]]
of errLoopLimit:
lex.err.message = $errKind % [args[0], args[1]]
Expand Down Expand Up @@ -276,19 +285,81 @@ func charTo(T: type, c: char): T {.inline.} =
of {'A'..'F'}: result = T(c) - T('A') + T(10)
else: doAssert(false, "should never executed")

proc scanHexDigits(lex: var Lexer, value: var int, token: var string): int =
  ## Consume consecutive hex digits from the stream (bounded by
  ## `conf.maxDigits`), accumulating the numeric value into `value` and the
  ## raw digit chars into `token` (kept for error reporting).
  ## Returns the number of digits consumed.
  # NOTE(review): the diff view interleaved the stale pre-change line
  # `value = value * 16 + charTo(int, lex.stream.read)` with the new
  # three-line form; keeping both would read the stream twice per digit.
  # Only the post-change form (read once into `c`) is retained.
  safeLoop(lex.conf.maxDigits, lex.safePeek HexDigits):
    inc result
    let c = lex.stream.read
    value = value * 16 + charTo(int, c)
    token.add c

proc invalidEscapeChar(lex: var Lexer) =
  ## Record an `errInvalidEscape` error, reporting the offending character
  ## when one is available, or the EOF token name when the stream is empty.
  if lex.stream.readable:
    lex.lexerError(errInvalidEscape, lex.stream.peek)
  else:
    lex.lexerError(errInvalidEscape, tokEof)

proc scanUnicode(lex: var Lexer): bool =
  ## Scan the remainder of a `\u` escape sequence (the `\u` prefix has
  ## already been consumed) and append the decoded code point to `lex.token`
  ## as UTF-8. Two forms are accepted:
  ##   - fixed 4-digit hex `\uXXXX`, including UTF-16 surrogate pairs
  ##     (`\uD83D\uDCA9`), per the commit's full-range unicode support
  ##   - variable-length `\u{X..XXXXXX}` (up to 6 hex digits), rejected when
  ##     `lfJsonCompatibility` is set since JSON has no such form
  ## Returns false with `lex.err` populated on any malformed sequence.
  # NOTE(review): the diff view interleaved the stale pre-change body
  # (`var code: int` / `lex.token.add unicode.toUTF8(Rune(code))` etc.)
  # with the new implementation; only the post-change logic is kept here.
  if lex.safePeek HexDigits:
    var codePoint: int
    var token: string
    if lex.scanHexDigits(codePoint, token) != 4:
      lex.lexerError(errInvalidUnicode, token)
      return false

    if Utf16.highSurrogate(codePoint):
      # A high surrogate is only valid when immediately followed by a
      # `\uXXXX` low surrogate; anything else is an orphaned surrogate.
      if not lex.safePeek '\\':
        lex.lexerError(errOrphanSurrogate, token)
        return false
      advance lex.stream

      if not lex.safePeek 'u':
        lex.lexerError(errOrphanSurrogate, token)
        return false
      advance lex.stream

      var surrogate: int
      var hexSurrogate: string
      if lex.scanHexDigits(surrogate, hexSurrogate) != 4:
        lex.lexerError(errInvalidUnicode, hexSurrogate)
        return false

      # Combine the pair into a single supplementary-plane code point.
      codePoint = Utf16.utf(codePoint, surrogate)
      token.add "\\u"
      token.add hexSurrogate

    # Utf8.append is presumed to reject invalid code points (e.g. an
    # orphaned low surrogate scanned without a preceding high surrogate) —
    # TODO confirm against private/utf, which is outside this view.
    if not Utf8.append(lex.token, codePoint):
      lex.lexerError(errInvalidUnicode, token)
      return false

    return true

  elif lex.safePeek '{':
    # The braced variable-length form is a GraphQL extension; JSON strings
    # must not contain it.
    if lfJsonCompatibility in lex.flags:
      lex.lexerError(errInvalidEscape, '{')
      return false

    advance lex.stream # eat '{'

    var codePoint: int
    var token: string
    if lex.scanHexDigits(codePoint, token) > 6:
      # NOTE(review): zero digits (`\u{}`) is not rejected by this check —
      # confirm whether Utf8.append(.., 0) handles that case as intended.
      lex.lexerError(errInvalidUnicode, token)
      return false

    if not Utf8.append(lex.token, codePoint):
      lex.lexerError(errInvalidUnicode, token)
      return false

    if not lex.safePeek '}':
      lex.invalidEscapeChar
      return false

    advance lex.stream # eat '}'
    return true

  else:
    lex.invalidEscapeChar
    return false

proc scanEscapeChar(lex: var Lexer): bool =
if not lex.stream.readable:
Expand Down Expand Up @@ -349,6 +420,8 @@ proc scanMultiLineString(lex: var Lexer) =
lex.token.setLen(lex.token.len-1)
lex.token.add "\"\"\"" # Escape Triple-Quote (\""")
else:
if Utf8.validate(lex.token) == false:
lex.lexerError(errInvalidUTF8)
return
else:
lex.token.add '"'
Expand All @@ -368,7 +441,6 @@ proc scanMultiLineString(lex: var Lexer) =
of '\\':
lex.token.add lex.stream.read
else:
# FIXME: this is not a valid UTF-16 lexer
lex.token.add lex.stream.read

lex.lexerError(errUnterminatedBlockString)
Expand All @@ -382,14 +454,15 @@ proc scanSingleLineString(lex: var Lexer) =
return
of '"':
advance lex.stream
if Utf8.validate(lex.token) == false:
lex.lexerError(errInvalidUTF8)
return
of '\\':
advance lex.stream
if not lex.scanEscapeChar():
return
continue
else:
# FIXME: this is not a valid UTF-16 lexer
lex.token.add lex.stream.read

lex.lexerError(errUnterminatedString)
Expand Down

0 comments on commit e7848a0

Please sign in to comment.