Skip to content

Commit

Permalink
better unicode support
Browse files Browse the repository at this point in the history
Input strings now support the full range of fixed-length escaped unicode chars
and variable-length escaped unicode chars, along with UTF-8 encoding.

Output strings also support the full range of fixed-length escaped unicode chars
for JSON strings.

Full-range unicode support means surrogate pairs are used when necessary.

fixes #50
  • Loading branch information
jangko committed Oct 31, 2021
1 parent a8af6b7 commit e7848a0
Show file tree
Hide file tree
Showing 6 changed files with 918 additions and 431 deletions.
16 changes: 16 additions & 0 deletions docs/toc.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
- [Overview](#overview)
- [Graphql core](#graphql-core)
- [Security features](#security-features)
- [Unicode support](#unicode-support)

- [Tutorial](tutorial.md)
- [Important notes](tutorial.md#important-notes)
Expand Down Expand Up @@ -83,3 +84,18 @@ can bring down the service, both lexer and parser are configurable to mitigate t
- `maxDefinitions`. Queries, mutations, subscriptions, and fragments total number should be reasonable. (default = 512)
- `maxChoices`. Unions and directive's locations are limited by this number. (default = 64)

### Unicode support

- Input string:
- The accepted encoding for input strings is UTF-8.
- Escaped unicode in quoted strings takes the form of UTF-16 BE:
- Fixed 4 digit hex: e.g. `\u000A`
- Variable length: `\u{1F4A9}` with range (>= 0x0000 and <= 0xD7FF, or >= 0xE000 and <= 0x10FFFF)
- Escape sequences are only meaningful within a single-quoted string.
  In multiline (block) strings, unicode chars must be encoded using UTF-8.
- Surrogate pair: `\uD83D\uDCA9` is equal to `\u{1F4A9}`

- Output string:
- Output strings are subject to the output serialization format's specification.
- For example, output using JSON as the serialization format will result in a UTF-8 encoded string.
- If the escape flag is set, it will instead use fixed-length UTF-16 BE 4-digit hex escapes, similar to GraphQL escape sequences.
36 changes: 31 additions & 5 deletions graphql/builtin/json_respstream.nim
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@

import
faststreams/[outputs, textio],
../common/[respstream, ast]
../common/[respstream, ast],
../private/utf

export respstream

Expand All @@ -25,6 +26,7 @@ type
stream: OutputStream
stack: seq[State]
doubleEscape: bool
escapeUnicode: bool

template top(x: seq[State]): State =
x[^1]
Expand Down Expand Up @@ -75,7 +77,7 @@ proc write*(x: JsonRespStream, v: string) =
append '\\'
append c

for c in v:
template writeChar(c: char) =
case c
of '\L': addPrefixSlash 'n'
of '\b': addPrefixSlash 'b'
Expand All @@ -96,6 +98,25 @@ proc write*(x: JsonRespStream, v: string) =
of '\\': addPrefixSlash '\\'
else: append c

if x.escapeUnicode:
for c in Utf8.codePoints(v):
if c >= 0x80:
let p = Utf16.toPair(c)
if p.state == Utf16One:
append "\\u"
x.stream.writeHex([char(p.cp shr 8), char(p.cp and 0xFF)])
elif p.state == Utf16Two:
append "\\u"
x.stream.writeHex([char(p.hi shr 8), char(p.hi and 0xFF)])
append "\\u"
x.stream.writeHex([char(p.lo shr 8), char(p.lo and 0xFF)])
else:
let cc = c.char
writeChar(cc)
else:
for c in v:
writeChar(c)

if x.doubleEscape:
append "\\\""
else:
Expand Down Expand Up @@ -198,12 +219,17 @@ proc getBytes*(x: JsonRespStream): seq[byte] =
proc len*(x: JsonRespStream): int =
x.stream.pos()

proc init*(v: JsonRespStream,
           doubleEscape: bool = false,
           escapeUnicode: bool = false) =
  ## (Re)initialize `v` with an empty in-memory output buffer.
  ## `doubleEscape`: emit strings double-escaped (for embedding in JSON).
  ## `escapeUnicode`: emit non-ASCII chars as `\uXXXX` (UTF-16 BE, surrogate
  ## pairs when needed) instead of raw UTF-8.
  # NOTE(review): the diff view retained the stale pre-change 2-parameter
  # signature on the line above the new one; the merged proc keeps only the
  # post-change 3-parameter form, which is backward compatible via defaults.
  v.stream = memoryOutput()
  v.stack = @[StateTop]
  v.doubleEscape = doubleEscape
  v.escapeUnicode = escapeUnicode

proc new*(_: type JsonRespStream,
          doubleEscape: bool = false,
          escapeUnicode: bool = false): JsonRespStream =
  ## Allocate and initialize a fresh JsonRespStream.
  ## See `init` for the meaning of `doubleEscape` and `escapeUnicode`.
  # NOTE(review): the diff view kept both the stale `v.init(doubleEscape)`
  # call and the new one; only the post-change call forwarding both flags
  # is retained here — otherwise `escapeUnicode` would be silently dropped.
  let v = JsonRespStream()
  v.init(doubleEscape, escapeUnicode)
  v
99 changes: 86 additions & 13 deletions graphql/lexer.nim
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
import
std/[unicode, strutils],
faststreams/inputs,
./common/[names, errors, types]
./common/[names, errors, types],
./private/utf

type
TokKind* = enum
Expand Down Expand Up @@ -48,11 +49,17 @@ type
errInvalidUnicode = "Invalid unicode sequence '$1'"
errInvalidChar = "Invalid char '$1'"
errLoopLimit = "loop limit $1 reached for $2"
errInvalidUTF8 = "Invalid UTF-8 sequence detected in string"
errOrphanSurrogate = "Orphaned surrogate codepoint detected '$1'"

LexerFlag* = enum
lfJsonCompatibility # parse json unicode escape chars but not graphql escape chars

LexConf* = object
maxIdentChars* : int
maxDigits* : int
maxStringChars*: int
flags* : set[LexerFlag]

LexConfInternal = object
maxIdentChars : LoopGuard
Expand All @@ -72,6 +79,7 @@ type
error* : LexerError
err* : ErrorDesc
conf : LexConfInternal
flags* : set[LexerFlag]

proc defaultLexConf*(): LexConf =
result.maxIdentChars = 128
Expand All @@ -88,7 +96,8 @@ proc init*(T: type Lexer, stream: InputStream, names: NameCache, conf = defaultL
stream: stream,
names: names,
line: 1,
conf: toInternalConf(conf)
conf: toInternalConf(conf),
flags: conf.flags
)

template peek(s: InputStream): char =
Expand Down Expand Up @@ -121,7 +130,7 @@ proc lexerError(lex: var Lexer, errKind: LexerError, args: varargs[string, `$`])
lex.err.message = $errKind

case errKind
of errInvalidEscape, errInvalidUnicode, errInvalidChar:
of errInvalidEscape, errInvalidUnicode, errInvalidChar, errOrphanSurrogate:
lex.err.message = $errKind % [args[0]]
of errLoopLimit:
lex.err.message = $errKind % [args[0], args[1]]
Expand Down Expand Up @@ -276,19 +285,81 @@ func charTo(T: type, c: char): T {.inline.} =
of {'A'..'F'}: result = T(c) - T('A') + T(10)
else: doAssert(false, "should never executed")

proc scanHexDigits(lex: var Lexer, value: var int, token: var string): int =
  ## Consume consecutive hex digits from the stream (bounded by
  ## `conf.maxDigits`), accumulating the numeric value into `value` and the
  ## raw digit chars into `token` (kept for error reporting).
  ## Returns the number of digits consumed.
  # NOTE(review): the diff view interleaved the stale pre-change line
  # `value = value * 16 + charTo(int, lex.stream.read)` with the new
  # three-line form; keeping both would read the stream twice per digit.
  # Only the post-change form (read once into `c`) is retained.
  safeLoop(lex.conf.maxDigits, lex.safePeek HexDigits):
    inc result
    let c = lex.stream.read
    value = value * 16 + charTo(int, c)
    token.add c

proc invalidEscapeChar(lex: var Lexer) =
  ## Record an `errInvalidEscape` error, reporting the offending character
  ## when one is available, or the EOF token name when the stream is empty.
  if lex.stream.readable:
    lex.lexerError(errInvalidEscape, lex.stream.peek)
  else:
    lex.lexerError(errInvalidEscape, tokEof)

proc scanUnicode(lex: var Lexer): bool =
  ## Scan the remainder of a `\u` escape sequence (the `\u` prefix has
  ## already been consumed) and append the decoded code point to `lex.token`
  ## as UTF-8. Two forms are accepted:
  ##   - fixed 4-digit hex `\uXXXX`, including UTF-16 surrogate pairs
  ##     (`\uD83D\uDCA9`), per the commit's full-range unicode support
  ##   - variable-length `\u{X..XXXXXX}` (up to 6 hex digits), rejected when
  ##     `lfJsonCompatibility` is set since JSON has no such form
  ## Returns false with `lex.err` populated on any malformed sequence.
  # NOTE(review): the diff view interleaved the stale pre-change body
  # (`var code: int` / `lex.token.add unicode.toUTF8(Rune(code))` etc.)
  # with the new implementation; only the post-change logic is kept here.
  if lex.safePeek HexDigits:
    var codePoint: int
    var token: string
    if lex.scanHexDigits(codePoint, token) != 4:
      lex.lexerError(errInvalidUnicode, token)
      return false

    if Utf16.highSurrogate(codePoint):
      # A high surrogate is only valid when immediately followed by a
      # `\uXXXX` low surrogate; anything else is an orphaned surrogate.
      if not lex.safePeek '\\':
        lex.lexerError(errOrphanSurrogate, token)
        return false
      advance lex.stream

      if not lex.safePeek 'u':
        lex.lexerError(errOrphanSurrogate, token)
        return false
      advance lex.stream

      var surrogate: int
      var hexSurrogate: string
      if lex.scanHexDigits(surrogate, hexSurrogate) != 4:
        lex.lexerError(errInvalidUnicode, hexSurrogate)
        return false

      # Combine the pair into a single supplementary-plane code point.
      codePoint = Utf16.utf(codePoint, surrogate)
      token.add "\\u"
      token.add hexSurrogate

    # Utf8.append is presumed to reject invalid code points (e.g. an
    # orphaned low surrogate scanned without a preceding high surrogate) —
    # TODO confirm against private/utf, which is outside this view.
    if not Utf8.append(lex.token, codePoint):
      lex.lexerError(errInvalidUnicode, token)
      return false

    return true

  elif lex.safePeek '{':
    # The braced variable-length form is a GraphQL extension; JSON strings
    # must not contain it.
    if lfJsonCompatibility in lex.flags:
      lex.lexerError(errInvalidEscape, '{')
      return false

    advance lex.stream # eat '{'

    var codePoint: int
    var token: string
    if lex.scanHexDigits(codePoint, token) > 6:
      # NOTE(review): zero digits (`\u{}`) is not rejected by this check —
      # confirm whether Utf8.append(.., 0) handles that case as intended.
      lex.lexerError(errInvalidUnicode, token)
      return false

    if not Utf8.append(lex.token, codePoint):
      lex.lexerError(errInvalidUnicode, token)
      return false

    if not lex.safePeek '}':
      lex.invalidEscapeChar
      return false

    advance lex.stream # eat '}'
    return true

  else:
    lex.invalidEscapeChar
    return false

proc scanEscapeChar(lex: var Lexer): bool =
if not lex.stream.readable:
Expand Down Expand Up @@ -349,6 +420,8 @@ proc scanMultiLineString(lex: var Lexer) =
lex.token.setLen(lex.token.len-1)
lex.token.add "\"\"\"" # Escape Triple-Quote (\""")
else:
if Utf8.validate(lex.token) == false:
lex.lexerError(errInvalidUTF8)
return
else:
lex.token.add '"'
Expand All @@ -368,7 +441,6 @@ proc scanMultiLineString(lex: var Lexer) =
of '\\':
lex.token.add lex.stream.read
else:
# FIXME: this is not a valid UTF-16 lexer
lex.token.add lex.stream.read

lex.lexerError(errUnterminatedBlockString)
Expand All @@ -382,14 +454,15 @@ proc scanSingleLineString(lex: var Lexer) =
return
of '"':
advance lex.stream
if Utf8.validate(lex.token) == false:
lex.lexerError(errInvalidUTF8)
return
of '\\':
advance lex.stream
if not lex.scanEscapeChar():
return
continue
else:
# FIXME: this is not a valid UTF-16 lexer
lex.token.add lex.stream.read

lex.lexerError(errUnterminatedString)
Expand Down

0 comments on commit e7848a0

Please sign in to comment.