RFC: Support full Unicode in lexer

Depends on #3115 Implements RFC at graphql/graphql-spec#849. * Replaces `isSourceCharacter` with `isUnicodeScalarValue` * Adds `isSupplementaryCodePoint`, used in String, BlockStrings, and Comments to ensure correct lexing of JavaScript's UTF-16 source. * Updates `printCodePointAt` to correctly print supplementary code points. * Adds variable-width Unicode escape sequences * Adds explicit support for legacy JSON-style fixed-width Unicode escape sequence surrogate pairs. * Adds `printString` to no longer rely on `JSON.stringify`. Borrows some implementation details from Node.js internals for string printing. Implements: > When producing a {StringValue}, implementations should use escape sequences to > represent non-printable control characters (U+0000 to U+001F and U+007F to > U+009F). Other escape sequences are not necessary, however an implementation may > use escape sequences to represent any other range of code points. Closes #2449 Co-authored-by: Andreas Marek <andimarek@fastmail.fm>
graphql · May 19, 2021 · 601b4c2 · 601b4c2
1 parent 2cd8510
commit 601b4c2
Show file tree

Hide file tree

Showing 5 changed files with 453 additions and 47 deletions.
diff --git a/src/language/__tests__/lexer-test.js b/src/language/__tests__/lexer-test.js
@@ -27,13 +27,6 @@ function expectSyntaxError(text: string) {
 }
 
 describe('Lexer', () => {
-  it('disallows uncommon control characters', () => {
-    expectSyntaxError('\u0007').to.deep.equal({
-      message: 'Syntax Error: Invalid character: U+0007.',
-      locations: [{ line: 1, column: 1 }],
-    });
-  });
-
   it('ignores BOM header', () => {
     expect(lexOne('\uFEFF foo')).to.contain({
       kind: TokenKind.NAME,
@@ -263,12 +256,98 @@ describe('Lexer', () => {
       value: 'slashes \\ /',
     });
 
+    expect(lexOne('"unescaped unicode outside BMP \u{1f600}"')).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 34,
+      value: 'unescaped unicode outside BMP \u{1f600}',
+    });
+
+    expect(
+      lexOne('"unescaped maximal unicode outside BMP \u{10ffff}"'),
+    ).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 42,
+      value: 'unescaped maximal unicode outside BMP \u{10ffff}',
+    });
+
     expect(lexOne('"unicode \\u1234\\u5678\\u90AB\\uCDEF"')).to.contain({
       kind: TokenKind.STRING,
       start: 0,
       end: 34,
       value: 'unicode \u1234\u5678\u90AB\uCDEF',
     });
+
+    expect(lexOne('"unicode \\u{1234}\\u{5678}\\u{90AB}\\u{CDEF}"')).to.contain(
+      {
+        kind: TokenKind.STRING,
+        start: 0,
+        end: 42,
+        value: 'unicode \u1234\u5678\u90AB\uCDEF',
+      },
+    );
+
+    expect(
+      lexOne('"string with unicode escape outside BMP \\u{1F600}"'),
+    ).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 50,
+      value: 'string with unicode escape outside BMP \u{1f600}',
+    });
+
+    expect(lexOne('"string with minimal unicode escape \\u{0}"')).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 42,
+      value: 'string with minimal unicode escape \u{0}',
+    });
+
+    expect(
+      lexOne('"string with maximal unicode escape \\u{10FFFF}"'),
+    ).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 47,
+      value: 'string with maximal unicode escape \u{10FFFF}',
+    });
+
+    expect(
+      lexOne('"string with maximal minimal unicode escape \\u{00000000}"'),
+    ).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 57,
+      value: 'string with maximal minimal unicode escape \u{0}',
+    });
+
+    expect(
+      lexOne('"string with unicode surrogate pair escape \\uD83D\\uDE00"'),
+    ).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 56,
+      value: 'string with unicode surrogate pair escape \u{1f600}',
+    });
+
+    expect(
+      lexOne('"string with minimal surrogate pair escape \\uD800\\uDC00"'),
+    ).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 56,
+      value: 'string with minimal surrogate pair escape \u{10000}',
+    });
+
+    expect(
+      lexOne('"string with maximal surrogate pair escape \\uDBFF\\uDFFF"'),
+    ).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 56,
+      value: 'string with maximal surrogate pair escape \u{10FFFF}',
+    });
   });
 
   it('lex reports useful string errors', () => {
@@ -298,16 +377,19 @@ describe('Lexer', () => {
       locations: [{ line: 1, column: 1 }],
     });
 
-    expectSyntaxError('"contains unescaped \u0007 control char"').to.deep.equal(
-      {
-        message: 'Syntax Error: Invalid character within String: U+0007.',
-        locations: [{ line: 1, column: 21 }],
-      },
-    );
+    expectSyntaxError('"bad surrogate \uDEAD"').to.deep.equal({
+      message: 'Syntax Error: Invalid character within String: U+DEAD.',
+      locations: [{ line: 1, column: 16 }],
+    });
+
+    expectSyntaxError('"bad high surrogate pair \uDEAD\uDEAD"').to.deep.equal({
+      message: 'Syntax Error: Invalid character within String: U+DEAD.',
+      locations: [{ line: 1, column: 26 }],
+    });
 
-    expectSyntaxError('"null-byte is not \u0000 end of file"').to.deep.equal({
-      message: 'Syntax Error: Invalid character within String: U+0000.',
-      locations: [{ line: 1, column: 19 }],
+    expectSyntaxError('"bad low surrogate pair \uD800\uD800"').to.deep.equal({
+      message: 'Syntax Error: Invalid character within String: U+D800.',
+      locations: [{ line: 1, column: 25 }],
     });
 
     expectSyntaxError('"multi\nline"').to.deep.equal({
@@ -354,6 +436,93 @@ describe('Lexer', () => {
       message: 'Syntax Error: Invalid Unicode escape sequence: "\\uXXXF".',
       locations: [{ line: 1, column: 6 }],
     });
+
+    expectSyntaxError('"bad \\u{} esc"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{}".',
+      locations: [{ line: 1, column: 6 }],
+    });
+
+    expectSyntaxError('"bad \\u{FXXX} esc"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FX".',
+      locations: [{ line: 1, column: 6 }],
+    });
+
+    expectSyntaxError('"bad \\u{FFFF esc"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF ".',
+      locations: [{ line: 1, column: 6 }],
+    });
+
+    expectSyntaxError('"bad \\u{FFFF"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF"".',
+      locations: [{ line: 1, column: 6 }],
+    });
+
+    expectSyntaxError('"too high \\u{110000} esc"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{110000}".',
+      locations: [{ line: 1, column: 11 }],
+    });
+
+    expectSyntaxError('"way too high \\u{12345678} esc"').to.deep.equal({
+      message:
+        'Syntax Error: Invalid Unicode escape sequence: "\\u{12345678}".',
+      locations: [{ line: 1, column: 15 }],
+    });
+
+    expectSyntaxError('"too long \\u{000000000} esc"').to.deep.equal({
+      message:
+        'Syntax Error: Invalid Unicode escape sequence: "\\u{000000000".',
+      locations: [{ line: 1, column: 11 }],
+    });
+
+    expectSyntaxError('"bad surrogate \\uDEAD esc"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".',
+      locations: [{ line: 1, column: 16 }],
+    });
+
+    expectSyntaxError('"bad surrogate \\u{DEAD} esc"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{DEAD}".',
+      locations: [{ line: 1, column: 16 }],
+    });
+
+    expectSyntaxError(
+      '"cannot use braces for surrogate pair \\u{D83D}\\u{DE00} esc"',
+    ).to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{D83D}".',
+      locations: [{ line: 1, column: 39 }],
+    });
+
+    expectSyntaxError(
+      '"bad high surrogate pair \\uDEAD\\uDEAD esc"',
+    ).to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".',
+      locations: [{ line: 1, column: 26 }],
+    });
+
+    expectSyntaxError(
+      '"bad low surrogate pair \\uD800\\uD800 esc"',
+    ).to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD800".',
+      locations: [{ line: 1, column: 25 }],
+    });
+
+    expectSyntaxError(
+      '"cannot escape half a pair \uD83D\\uDE00 esc"',
+    ).to.deep.equal({
+      message: 'Syntax Error: Invalid character within String: U+D83D.',
+      locations: [{ line: 1, column: 28 }],
+    });
+
+    expectSyntaxError(
+      '"cannot escape half a pair \\uD83D\uDE00 esc"',
+    ).to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".',
+      locations: [{ line: 1, column: 28 }],
+    });
+
+    expectSyntaxError('"bad \\uD83D\\not an escape"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".',
+      locations: [{ line: 1, column: 6 }],
+    });
   });
 
   it('lexes block strings', () => {
@@ -413,6 +582,13 @@ describe('Lexer', () => {
       value: 'unescaped \\n\\r\\b\\t\\f\\u1234',
     });
 
+    expect(lexOne('"""unescaped unicode outside BMP \u{1f600}"""')).to.contain({
+      kind: TokenKind.BLOCK_STRING,
+      start: 0,
+      end: 38,
+      value: 'unescaped unicode outside BMP \u{1f600}',
+    });
+
     expect(lexOne('"""slashes \\\\ \\/"""')).to.contain({
       kind: TokenKind.BLOCK_STRING,
       start: 0,
@@ -485,18 +661,9 @@ describe('Lexer', () => {
       locations: [{ line: 1, column: 16 }],
     });
 
-    expectSyntaxError(
-      '"""contains unescaped \u0007 control char"""',
-    ).to.deep.equal({
-      message: 'Syntax Error: Invalid character within String: U+0007.',
-      locations: [{ line: 1, column: 23 }],
-    });
-
-    expectSyntaxError(
-      '"""null-byte is not \u0000 end of file"""',
-    ).to.deep.equal({
-      message: 'Syntax Error: Invalid character within String: U+0000.',
-      locations: [{ line: 1, column: 21 }],
+    expectSyntaxError('"""contains invalid surrogate \uDEAD"""').to.deep.equal({
+      message: 'Syntax Error: Invalid character within String: U+DEAD.',
+      locations: [{ line: 1, column: 31 }],
     });
   });
 
@@ -836,10 +1003,30 @@ describe('Lexer', () => {
       locations: [{ line: 1, column: 1 }],
     });
 
+    expectSyntaxError('\x00').to.deep.equal({
+      message: 'Syntax Error: Unexpected character: U+0000.',
+      locations: [{ line: 1, column: 1 }],
+    });
+
+    expectSyntaxError('\b').to.deep.equal({
+      message: 'Syntax Error: Unexpected character: U+0008.',
+      locations: [{ line: 1, column: 1 }],
+    });
+
     expectSyntaxError('\u203B').to.deep.equal({
       message: 'Syntax Error: Unexpected character: U+203B.',
       locations: [{ line: 1, column: 1 }],
     });
+
+    expectSyntaxError('\u{1f600}').to.deep.equal({
+      message: 'Syntax Error: Unexpected character: U+1F600.',
+      locations: [{ line: 1, column: 1 }],
+    });
+
+    expectSyntaxError('\uDEAD').to.deep.equal({
+      message: 'Syntax Error: Invalid character: U+DEAD.',
+      locations: [{ line: 1, column: 1 }],
+    });
   });
 
   it('lex reports useful information for dashes in names', () => {
@@ -920,9 +1107,15 @@ describe('Lexer', () => {
       end: 9,
       value: ' Comment',
     });
-    expectSyntaxError('# \u0007').to.deep.equal({
-      message: 'Syntax Error: Invalid character: U+0007.',
-      locations: [{ line: 1, column: 3 }],
+    expect(lexOne('# Comment \u{1f600}').prev).to.contain({
+      kind: TokenKind.COMMENT,
+      start: 0,
+      end: 12,
+      value: ' Comment \u{1f600}',
+    });
+    expectSyntaxError('# Invalid surrogate \uDEAD').to.deep.equal({
+      message: 'Syntax Error: Invalid character: U+DEAD.',
+      locations: [{ line: 1, column: 21 }],
     });
   });
 });

diff --git a/src/language/__tests__/printString-test.js b/src/language/__tests__/printString-test.js
@@ -0,0 +1,54 @@
+import { expect } from 'chai';
+import { describe, it } from 'mocha';
+
+import { printString } from '../printString';
+
+describe('printString', () => {
+  it('prints a simple string', () => {
+    expect(printString('hello world')).to.equal('"hello world"');
+  });
+
+  it('escapes quotes', () => {
+    expect(printString('"hello world"')).to.equal('"\\"hello world\\""');
+  });
+
+  it('does not escape single quote', () => {
+    expect(printString("who's test")).to.equal('"who\'s test"');
+  });
+
+  it('escapes backslashes', () => {
+    expect(printString('escape: \\n')).to.equal('"escape: \\\\n"');
+  });
+
+  it('escapes well-known control chars', () => {
+    expect(printString('\b\f\n\r\t')).to.equal('"\\b\\f\\n\\r\\t"');
+  });
+
+  it('escapes zero byte', () => {
+    expect(printString('\x00')).to.equal('"\\u0000"');
+  });
+
+  it('does not escape space', () => {
+    expect(printString(' ')).to.equal('" "');
+  });
+
+  it('escapes all other control chars', () => {
+    for (let i = 1; i <= 0x9f; i++) {
+      const source = String.fromCharCode(i);
+      if (/[\b\f\n\r\t]/.test(source) || (i >= 0x0020 && i <= 0x007e)) {
+        continue;
+      }
+      expect(printString(source)).to.equal(
+        `"\\u00${i <= 0x000f ? '0' : ''}${i.toString(16).toUpperCase()}"`,
+      );
+    }
+  });
+
+  it('does not escape non-ascii character', () => {
+    expect(printString('\u21BB')).to.equal('"\u21BB"');
+  });
+
+  it('does not escape supplementary character', () => {
+    expect(printString('\u{1f600}')).to.equal('"\u{1f600}"');
+  });
+});