From 38ee31093394745e53e5943dbf860424f5e49b2d Mon Sep 17 00:00:00 2001
From: Lee Byron <lee.byron@robinhood.com>
Date: Mon, 17 May 2021 13:42:26 -0700
Subject: [PATCH] RFC: Support full Unicode in lexer

Depends on #3115

Implements RFC at graphql/graphql-spec#849.

* Replaces `isSourceCharacter` with `isUnicodeScalarValue`
* Adds `isSupplementaryCodePoint`, used in String, BlockStrings, and Comments to ensure correct lexing of JavaScript's UTF-16 source.
* Updates `printCodePointAt` to correctly print supplementary code points.
* Adds variable-width Unicode escape sequences
* Adds explicit support for legacy JSON-style fixed-width Unicode escape sequence surrogate pairs.
* Adds `printString` to no longer rely on `JSON.stringify`. Borrows some implementation details from Node.js internals for string printing.

  Implements:

  > When producing a {StringValue}, implementations should use escape sequences to
  > represent non-printable control characters (U+0000 to U+001F and U+007F to
  > U+009F). Other escape sequences are not necessary, however an implementation may
  > use escape sequences to represent any other range of code points.

Closes #2449

Co-authored-by: Andreas Marek <andimarek@fastmail.fm>
---
 cspell.yml                                 |   3 +
 src/language/__tests__/lexer-test.ts       | 255 ++++++++++++++++++---
 src/language/__tests__/printString-test.ts |  82 +++++++
 src/language/lexer.ts                      | 154 +++++++++++--
 src/language/printString.ts                |  38 +++
 src/language/printer.ts                    |   3 +-
 src/type/__tests__/introspection-test.ts   |  46 ++++
 7 files changed, 531 insertions(+), 50 deletions(-)
 create mode 100644 src/language/__tests__/printString-test.ts
 create mode 100644 src/language/printString.ts

diff --git a/cspell.yml b/cspell.yml
index 36a6cf6e0c..e20b940fa2 100644
--- a/cspell.yml
+++ b/cspell.yml
@@ -20,6 +20,9 @@ overrides:
   - filename: '**/docs/APIReference-*.md'
     ignoreRegExpList: ['/href="[^"]*"/']
 
+ignoreRegExpList:
+  - u\{[0-9a-f]{1,8}\}
+
 words:
   - graphiql
   - sublinks
diff --git a/src/language/__tests__/lexer-test.ts b/src/language/__tests__/lexer-test.ts
index d98f68b051..8e26212a77 100644
--- a/src/language/__tests__/lexer-test.ts
+++ b/src/language/__tests__/lexer-test.ts
@@ -28,13 +28,6 @@ function expectSyntaxError(text: string) {
 }
 
 describe('Lexer', () => {
-  it('disallows uncommon control characters', () => {
-    expectSyntaxError('\u0007').to.deep.equal({
-      message: 'Syntax Error: Invalid character: U+0007.',
-      locations: [{ line: 1, column: 1 }],
-    });
-  });
-
   it('ignores BOM header', () => {
     expect(lexOne('\uFEFF foo')).to.contain({
       kind: TokenKind.NAME,
@@ -269,12 +262,98 @@ describe('Lexer', () => {
       value: 'slashes \\ /',
     });
 
+    expect(lexOne('"unescaped unicode outside BMP \u{1f600}"')).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 34,
+      value: 'unescaped unicode outside BMP \u{1f600}',
+    });
+
+    expect(
+      lexOne('"unescaped maximal unicode outside BMP \u{10ffff}"'),
+    ).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 42,
+      value: 'unescaped maximal unicode outside BMP \u{10ffff}',
+    });
+
     expect(lexOne('"unicode \\u1234\\u5678\\u90AB\\uCDEF"')).to.contain({
       kind: TokenKind.STRING,
       start: 0,
       end: 34,
       value: 'unicode \u1234\u5678\u90AB\uCDEF',
     });
+
+    expect(lexOne('"unicode \\u{1234}\\u{5678}\\u{90AB}\\u{CDEF}"')).to.contain(
+      {
+        kind: TokenKind.STRING,
+        start: 0,
+        end: 42,
+        value: 'unicode \u1234\u5678\u90AB\uCDEF',
+      },
+    );
+
+    expect(
+      lexOne('"string with unicode escape outside BMP \\u{1F600}"'),
+    ).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 50,
+      value: 'string with unicode escape outside BMP \u{1f600}',
+    });
+
+    expect(lexOne('"string with minimal unicode escape \\u{0}"')).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 42,
+      value: 'string with minimal unicode escape \u{0}',
+    });
+
+    expect(
+      lexOne('"string with maximal unicode escape \\u{10FFFF}"'),
+    ).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 47,
+      value: 'string with maximal unicode escape \u{10FFFF}',
+    });
+
+    expect(
+      lexOne('"string with maximal minimal unicode escape \\u{00000000}"'),
+    ).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 57,
+      value: 'string with maximal minimal unicode escape \u{0}',
+    });
+
+    expect(
+      lexOne('"string with unicode surrogate pair escape \\uD83D\\uDE00"'),
+    ).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 56,
+      value: 'string with unicode surrogate pair escape \u{1f600}',
+    });
+
+    expect(
+      lexOne('"string with minimal surrogate pair escape \\uD800\\uDC00"'),
+    ).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 56,
+      value: 'string with minimal surrogate pair escape \u{10000}',
+    });
+
+    expect(
+      lexOne('"string with maximal surrogate pair escape \\uDBFF\\uDFFF"'),
+    ).to.contain({
+      kind: TokenKind.STRING,
+      start: 0,
+      end: 56,
+      value: 'string with maximal surrogate pair escape \u{10FFFF}',
+    });
   });
 
   it('lex reports useful string errors', () => {
@@ -304,16 +383,19 @@ describe('Lexer', () => {
       locations: [{ line: 1, column: 1 }],
     });
 
-    expectSyntaxError('"contains unescaped \u0007 control char"').to.deep.equal(
-      {
-        message: 'Syntax Error: Invalid character within String: U+0007.',
-        locations: [{ line: 1, column: 21 }],
-      },
-    );
+    expectSyntaxError('"bad surrogate \uDEAD"').to.deep.equal({
+      message: 'Syntax Error: Invalid character within String: U+DEAD.',
+      locations: [{ line: 1, column: 16 }],
+    });
+
+    expectSyntaxError('"bad high surrogate pair \uDEAD\uDEAD"').to.deep.equal({
+      message: 'Syntax Error: Invalid character within String: U+DEAD.',
+      locations: [{ line: 1, column: 26 }],
+    });
 
-    expectSyntaxError('"null-byte is not \u0000 end of file"').to.deep.equal({
-      message: 'Syntax Error: Invalid character within String: U+0000.',
-      locations: [{ line: 1, column: 19 }],
+    expectSyntaxError('"bad low surrogate pair \uD800\uD800"').to.deep.equal({
+      message: 'Syntax Error: Invalid character within String: U+D800.',
+      locations: [{ line: 1, column: 25 }],
     });
 
     expectSyntaxError('"multi\nline"').to.deep.equal({
@@ -360,6 +442,93 @@ describe('Lexer', () => {
       message: 'Syntax Error: Invalid Unicode escape sequence: "\\uXXXF".',
       locations: [{ line: 1, column: 6 }],
     });
+
+    expectSyntaxError('"bad \\u{} esc"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{}".',
+      locations: [{ line: 1, column: 6 }],
+    });
+
+    expectSyntaxError('"bad \\u{FXXX} esc"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FX".',
+      locations: [{ line: 1, column: 6 }],
+    });
+
+    expectSyntaxError('"bad \\u{FFFF esc"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF ".',
+      locations: [{ line: 1, column: 6 }],
+    });
+
+    expectSyntaxError('"bad \\u{FFFF"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF"".',
+      locations: [{ line: 1, column: 6 }],
+    });
+
+    expectSyntaxError('"too high \\u{110000} esc"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{110000}".',
+      locations: [{ line: 1, column: 11 }],
+    });
+
+    expectSyntaxError('"way too high \\u{12345678} esc"').to.deep.equal({
+      message:
+        'Syntax Error: Invalid Unicode escape sequence: "\\u{12345678}".',
+      locations: [{ line: 1, column: 15 }],
+    });
+
+    expectSyntaxError('"too long \\u{000000000} esc"').to.deep.equal({
+      message:
+        'Syntax Error: Invalid Unicode escape sequence: "\\u{000000000".',
+      locations: [{ line: 1, column: 11 }],
+    });
+
+    expectSyntaxError('"bad surrogate \\uDEAD esc"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".',
+      locations: [{ line: 1, column: 16 }],
+    });
+
+    expectSyntaxError('"bad surrogate \\u{DEAD} esc"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{DEAD}".',
+      locations: [{ line: 1, column: 16 }],
+    });
+
+    expectSyntaxError(
+      '"cannot use braces for surrogate pair \\u{D83D}\\u{DE00} esc"',
+    ).to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{D83D}".',
+      locations: [{ line: 1, column: 39 }],
+    });
+
+    expectSyntaxError(
+      '"bad high surrogate pair \\uDEAD\\uDEAD esc"',
+    ).to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".',
+      locations: [{ line: 1, column: 26 }],
+    });
+
+    expectSyntaxError(
+      '"bad low surrogate pair \\uD800\\uD800 esc"',
+    ).to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD800".',
+      locations: [{ line: 1, column: 25 }],
+    });
+
+    expectSyntaxError(
+      '"cannot escape half a pair \uD83D\\uDE00 esc"',
+    ).to.deep.equal({
+      message: 'Syntax Error: Invalid character within String: U+D83D.',
+      locations: [{ line: 1, column: 28 }],
+    });
+
+    expectSyntaxError(
+      '"cannot escape half a pair \\uD83D\uDE00 esc"',
+    ).to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".',
+      locations: [{ line: 1, column: 28 }],
+    });
+
+    expectSyntaxError('"bad \\uD83D\\not an escape"').to.deep.equal({
+      message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".',
+      locations: [{ line: 1, column: 6 }],
+    });
   });
 
   it('lexes block strings', () => {
@@ -419,6 +588,13 @@ describe('Lexer', () => {
       value: 'unescaped \\n\\r\\b\\t\\f\\u1234',
     });
 
+    expect(lexOne('"""unescaped unicode outside BMP \u{1f600}"""')).to.contain({
+      kind: TokenKind.BLOCK_STRING,
+      start: 0,
+      end: 38,
+      value: 'unescaped unicode outside BMP \u{1f600}',
+    });
+
     expect(lexOne('"""slashes \\\\ \\/"""')).to.contain({
       kind: TokenKind.BLOCK_STRING,
       start: 0,
@@ -491,18 +667,9 @@ describe('Lexer', () => {
       locations: [{ line: 1, column: 16 }],
     });
 
-    expectSyntaxError(
-      '"""contains unescaped \u0007 control char"""',
-    ).to.deep.equal({
-      message: 'Syntax Error: Invalid character within String: U+0007.',
-      locations: [{ line: 1, column: 23 }],
-    });
-
-    expectSyntaxError(
-      '"""null-byte is not \u0000 end of file"""',
-    ).to.deep.equal({
-      message: 'Syntax Error: Invalid character within String: U+0000.',
-      locations: [{ line: 1, column: 21 }],
+    expectSyntaxError('"""contains invalid surrogate \uDEAD"""').to.deep.equal({
+      message: 'Syntax Error: Invalid character within String: U+DEAD.',
+      locations: [{ line: 1, column: 31 }],
     });
   });
 
@@ -842,6 +1009,16 @@ describe('Lexer', () => {
       locations: [{ line: 1, column: 1 }],
     });
 
+    expectSyntaxError('\x00').to.deep.equal({
+      message: 'Syntax Error: Unexpected character: U+0000.',
+      locations: [{ line: 1, column: 1 }],
+    });
+
+    expectSyntaxError('\b').to.deep.equal({
+      message: 'Syntax Error: Unexpected character: U+0008.',
+      locations: [{ line: 1, column: 1 }],
+    });
+
     expectSyntaxError('\u00AA').to.deep.equal({
       message: 'Syntax Error: Unexpected character: U+00AA.',
       locations: [{ line: 1, column: 1 }],
@@ -856,6 +1033,16 @@ describe('Lexer', () => {
       message: 'Syntax Error: Unexpected character: U+203B.',
       locations: [{ line: 1, column: 1 }],
     });
+
+    expectSyntaxError('\u{1f600}').to.deep.equal({
+      message: 'Syntax Error: Unexpected character: U+1F600.',
+      locations: [{ line: 1, column: 1 }],
+    });
+
+    expectSyntaxError('\uDEAD').to.deep.equal({
+      message: 'Syntax Error: Invalid character: U+DEAD.',
+      locations: [{ line: 1, column: 1 }],
+    });
   });
 
   it('lex reports useful information for dashes in names', () => {
@@ -936,9 +1123,15 @@ describe('Lexer', () => {
       end: 9,
       value: ' Comment',
     });
-    expectSyntaxError('# \u0007').to.deep.equal({
-      message: 'Syntax Error: Invalid character: U+0007.',
-      locations: [{ line: 1, column: 3 }],
+    expect(lexOne('# Comment \u{1f600}').prev).to.contain({
+      kind: TokenKind.COMMENT,
+      start: 0,
+      end: 12,
+      value: ' Comment \u{1f600}',
+    });
+    expectSyntaxError('# Invalid surrogate \uDEAD').to.deep.equal({
+      message: 'Syntax Error: Invalid character: U+DEAD.',
+      locations: [{ line: 1, column: 21 }],
     });
   });
 });
diff --git a/src/language/__tests__/printString-test.ts b/src/language/__tests__/printString-test.ts
new file mode 100644
index 0000000000..fff1bfeec0
--- /dev/null
+++ b/src/language/__tests__/printString-test.ts
@@ -0,0 +1,82 @@
+import { expect } from 'chai';
+import { describe, it } from 'mocha';
+
+import { printString } from '../printString';
+
+describe('printString', () => {
+  it('prints a simple string', () => {
+    expect(printString('hello world')).to.equal('"hello world"');
+  });
+
+  it('escapes quotes', () => {
+    expect(printString('"hello world"')).to.equal('"\\"hello world\\""');
+  });
+
+  it('does not escape single quote', () => {
+    expect(printString("who's test")).to.equal('"who\'s test"');
+  });
+
+  it('escapes backslashes', () => {
+    expect(printString('escape: \\')).to.equal('"escape: \\\\"');
+  });
+
+  it('escapes well-known control chars', () => {
+    expect(printString('\b\f\n\r\t')).to.equal('"\\b\\f\\n\\r\\t"');
+  });
+
+  it('escapes zero byte', () => {
+    expect(printString('\x00')).to.equal('"\\u0000"');
+  });
+
+  it('does not escape space', () => {
+    expect(printString(' ')).to.equal('" "');
+  });
+
+  it('does not escape non-ascii character', () => {
+    expect(printString('\u21BB')).to.equal('"\u21BB"');
+  });
+
+  it('does not escape supplementary character', () => {
+    expect(printString('\u{1f600}')).to.equal('"\u{1f600}"');
+  });
+
+  it('escapes all control chars', () => {
+    /* spellchecker:ignore abcdefghijklmnopqrstuvwxyz */
+    expect(
+      printString(
+        '\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007' +
+          '\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F' +
+          '\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017' +
+          '\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F' +
+          '\u0020\u0021\u0022\u0023\u0024\u0025\u0026\u0027' +
+          '\u0028\u0029\u002A\u002B\u002C\u002D\u002E\u002F' +
+          '\u0030\u0031\u0032\u0033\u0034\u0035\u0036\u0037' +
+          '\u0038\u0039\u003A\u003B\u003C\u003D\u003E\u003F' +
+          '\u0040\u0041\u0042\u0043\u0044\u0045\u0046\u0047' +
+          '\u0048\u0049\u004A\u004B\u004C\u004D\u004E\u004F' +
+          '\u0050\u0051\u0052\u0053\u0054\u0055\u0056\u0057' +
+          '\u0058\u0059\u005A\u005B\u005C\u005D\u005E\u005F' +
+          '\u0060\u0061\u0062\u0063\u0064\u0065\u0066\u0067' +
+          '\u0068\u0069\u006A\u006B\u006C\u006D\u006E\u006F' +
+          '\u0070\u0071\u0072\u0073\u0074\u0075\u0076\u0077' +
+          '\u0078\u0079\u007A\u007B\u007C\u007D\u007E\u007F' +
+          '\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087' +
+          '\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F' +
+          '\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097' +
+          '\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F',
+      ),
+    ).to.equal(
+      '"\\u0000\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006\\u0007' +
+        '\\b\\t\\n\\u000B\\f\\r\\u000E\\u000F' +
+        '\\u0010\\u0011\\u0012\\u0013\\u0014\\u0015\\u0016\\u0017' +
+        '\\u0018\\u0019\\u001A\\u001B\\u001C\\u001D\\u001E\\u001F' +
+        ' !\\"#$%&\'()*+,-./0123456789:;<=>?' +
+        '@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_' +
+        '`abcdefghijklmnopqrstuvwxyz{|}~\\u007F' +
+        '\\u0080\\u0081\\u0082\\u0083\\u0084\\u0085\\u0086\\u0087' +
+        '\\u0088\\u0089\\u008A\\u008B\\u008C\\u008D\\u008E\\u008F' +
+        '\\u0090\\u0091\\u0092\\u0093\\u0094\\u0095\\u0096\\u0097' +
+        '\\u0098\\u0099\\u009A\\u009B\\u009C\\u009D\\u009E\\u009F"',
+    );
+  });
+});
diff --git a/src/language/lexer.ts b/src/language/lexer.ts
index 5fdb6466ef..21e872522f 100644
--- a/src/language/lexer.ts
+++ b/src/language/lexer.ts
@@ -108,20 +108,53 @@ export function isPunctuatorTokenKind(kind: TokenKindEnum): boolean {
 }
 
 /**
- * ```
+ * A Unicode scalar value is any Unicode code point except surrogate code
+ * points. In other words, the inclusive ranges of values 0x0000 to 0xD7FF and
+ * 0xE000 to 0x10FFFF.
+ *
  * SourceCharacter ::
- *   - U+0009 (Horizontal Tab)
- *   - U+000A (New Line)
- *   - U+000D (Carriage Return)
- *   - U+0020-U+FFFF
- * ```
+ *   - "Any Unicode scalar value"
  */
-function isSourceCharacter(code: number): boolean {
+function isUnicodeScalarValue(code: number): boolean {
   return (
-    code >= 0x0020 || code === 0x0009 || code === 0x000a || code === 0x000d
+    (code >= 0x0000 && code <= 0xd7ff) || (code >= 0xe000 && code <= 0x10ffff)
+  );
+}
+
+/**
+ * The GraphQL specification defines source text as a sequence of unicode scalar
+ * values (which Unicode defines to exclude surrogate code points). However
+ * JavaScript defines strings as a sequence of UTF-16 code units which may
+ * include surrogates. A surrogate pair is a valid source character as it
+ * encodes a supplementary code point (above U+FFFF), but unpaired surrogate
+ * code points are not valid source characters.
+ */
+function isSupplementaryCodePoint(body: string, location: number): boolean {
+  return (
+    isLeadingSurrogate(body.charCodeAt(location)) &&
+    isTrailingSurrogate(body.charCodeAt(location + 1))
+  );
+}
+
+function isLeadingSurrogate(code: number): boolean {
+  return code >= 0xd800 && code <= 0xdbff;
+}
+
+function isTrailingSurrogate(code: number): boolean {
+  return code >= 0xdc00 && code <= 0xdfff;
+}
+
+function encodeSurrogatePair(point: number): string {
+  return String.fromCharCode(
+    0xd800 | ((point - 0x10000) >> 10), // Leading Surrogate
+    0xdc00 | ((point - 0x10000) & 0x3ff), // Trailing Surrogate
   );
 }
 
+function decodeSurrogatePair(leading: number, trailing: number): number {
+  return 0x10000 + (((leading & 0x03ff) << 10) | (trailing & 0x03ff));
+}
+
 /**
  * Prints the code point (or end of file reference) at a given location in a
  * source for use in error messages.
@@ -140,9 +173,12 @@ function printCodePointAt(lexer: Lexer, location: number): string {
     return code === 0x0022 ? "'\"'" : `"${body[location]}"`;
   }
   // Unicode code point
+  const point = isSupplementaryCodePoint(body, location)
+    ? decodeSurrogatePair(code, body.charCodeAt(location + 1))
+    : code;
   const zeroPad =
-    code > 0xfff ? '' : code > 0xff ? '0' : code > 0xf ? '00' : '000';
-  return `U+${zeroPad}${code.toString(16).toUpperCase()}`;
+    point > 0xfff ? '' : point > 0xff ? '0' : point > 0xf ? '00' : '000';
+  return `U+${zeroPad}${point.toString(16).toUpperCase()}`;
 }
 
 /**
@@ -286,7 +322,7 @@ function readNextToken(lexer: Lexer, start: number): Token {
       position,
       code === 0x0027
         ? 'Unexpected single quote character (\'), did you mean to use a double quote (")?'
-        : isSourceCharacter(code)
+        : isUnicodeScalarValue(code) || isSupplementaryCodePoint(body, position)
         ? `Unexpected character: ${printCodePointAt(lexer, position)}.`
         : `Invalid character: ${printCodePointAt(lexer, position)}.`,
     );
@@ -318,8 +354,10 @@ function readComment(lexer: Lexer, start: number): Token {
     }
 
     // SourceCharacter
-    if (isSourceCharacter(code)) {
+    if (isUnicodeScalarValue(code)) {
       ++position;
+    } else if (isSupplementaryCodePoint(body, position)) {
+      position += 2;
     } else {
       break;
     }
@@ -474,7 +512,9 @@ function readDigits(lexer: Lexer, start: number, firstCode: number): number {
  *   - `\u` EscapedUnicode
  *   - `\` EscapedCharacter
  *
- * EscapedUnicode :: /[0-9A-Fa-f]{4}/
+ * EscapedUnicode ::
+ *   - `{` HexDigit+ `}`
+ *   - HexDigit HexDigit HexDigit HexDigit
  *
  * EscapedCharacter :: one of `"` `\` `/` `b` `f` `n` `r` `t`
  * ```
@@ -500,7 +540,9 @@ function readString(lexer: Lexer, start: number): Token {
       value += body.slice(chunkStart, position);
       const escape =
         body.charCodeAt(position + 1) === 0x0075 // u
-          ? readEscapedUnicode(lexer, position)
+          ? body.charCodeAt(position + 2) === 0x007b // {
+            ? readEscapedUnicodeVariableWidth(lexer, position)
+            : readEscapedUnicodeFixedWidth(lexer, position)
           : readEscapedCharacter(lexer, position);
       value += escape.value;
       position += escape.size;
@@ -514,8 +556,10 @@ function readString(lexer: Lexer, start: number): Token {
     }
 
     // SourceCharacter
-    if (isSourceCharacter(code)) {
+    if (isUnicodeScalarValue(code)) {
       ++position;
+    } else if (isSupplementaryCodePoint(body, position)) {
+      position += 2;
     } else {
       throw syntaxError(
         lexer.source,
@@ -537,14 +581,81 @@ interface EscapeSequence {
   size: number;
 }
 
-function readEscapedUnicode(lexer: Lexer, position: number): EscapeSequence {
+function readEscapedUnicodeVariableWidth(
+  lexer: Lexer,
+  position: number,
+): EscapeSequence {
+  const body = lexer.source.body;
+  let point = 0;
+  let size = 3;
+  // Cannot be larger than 12 chars (\u{00000000}).
+  while (size < 12) {
+    const code = body.charCodeAt(position + size++);
+    // Closing Brace (})
+    if (code === 0x007d) {
+      // Must be at least 5 chars (\u{0}) and encode a Unicode scalar value.
+      if (size < 5 || !isUnicodeScalarValue(point)) {
+        break;
+      }
+      // JavaScript defines strings as a sequence of UTF-16 code units and
+      // encodes Unicode code points above U+FFFF using a surrogate pair.
+      return {
+        value:
+          point <= 0xffff
+            ? String.fromCharCode(point)
+            : encodeSurrogatePair(point),
+        size,
+      };
+    }
+    // Append this hex digit to the code point.
+    point = (point << 4) | readHexDigit(code);
+    if (point < 0) {
+      break;
+    }
+  }
+
+  throw syntaxError(
+    lexer.source,
+    position,
+    `Invalid Unicode escape sequence: "${body.slice(
+      position,
+      position + size,
+    )}".`,
+  );
+}
+
+function readEscapedUnicodeFixedWidth(
+  lexer: Lexer,
+  position: number,
+): EscapeSequence {
   const body = lexer.source.body;
   const code = read16BitHexCode(body, position + 2);
 
-  if (code >= 0) {
+  if (isUnicodeScalarValue(code)) {
     return { value: String.fromCharCode(code), size: 6 };
   }
 
+  // GraphQL allows JSON-style surrogate pair escape sequences, but only when
+  // a valid pair is formed.
+  if (isLeadingSurrogate(code)) {
+    // \u
+    if (
+      body.charCodeAt(position + 6) === 0x005c &&
+      body.charCodeAt(position + 7) === 0x0075
+    ) {
+      const trailingCode = read16BitHexCode(body, position + 8);
+      if (isTrailingSurrogate(trailingCode)) {
+        // JavaScript defines strings as a sequence of UTF-16 code units and
+        // encodes Unicode code points above U+FFFF using a surrogate pair of
+        // code units. Since this is a surrogate pair escape sequence, just
+        // include both codes into the JavaScript string value. Had JavaScript
+        // not been internally based on UTF-16, then this surrogate pair would
+        // be decoded to retrieve the supplementary code point.
+        return { value: String.fromCharCode(code, trailingCode), size: 12 };
+      }
+    }
+  }
+
   throw syntaxError(
     lexer.source,
     position,
@@ -578,6 +689,11 @@ function read16BitHexCode(body: string, position: number): number {
  * 'a' becomes 10, 'f' becomes 15
  *
  * Returns -1 if the provided character code was not a valid hexadecimal digit.
+ *
+ * HexDigit :: one of
+ *   - `0` `1` `2` `3` `4` `5` `6` `7` `8` `9`
+ *   - `A` `B` `C` `D` `E` `F`
+ *   - `a` `b` `c` `d` `e` `f`
  */
 function readHexDigit(code: number): number {
   return code >= 0x0030 && code <= 0x0039 // 0-9
@@ -696,8 +812,10 @@ function readBlockString(lexer: Lexer, start: number): Token {
     }
 
     // SourceCharacter
-    if (isSourceCharacter(code)) {
+    if (isUnicodeScalarValue(code)) {
       ++position;
+    } else if (isSupplementaryCodePoint(body, position)) {
+      position += 2;
     } else {
       throw syntaxError(
         lexer.source,
diff --git a/src/language/printString.ts b/src/language/printString.ts
new file mode 100644
index 0000000000..b091bcc2c1
--- /dev/null
+++ b/src/language/printString.ts
@@ -0,0 +1,38 @@
+/**
+ * Prints a string as a GraphQL StringValue literal. Replaces control characters
+ * and excluded characters (" U+0022 and \ U+005C) with escape sequences.
+ */
+export function printString(str: string): string {
+  return `"${str.replace(escapedRegExp, escapedReplacer)}"`;
+}
+
+// eslint-disable-next-line no-control-regex
+const escapedRegExp = /[\x00-\x1f\x22\x5c\x7f-\x9f]/g;
+
+function escapedReplacer(str: string): string {
+  return escapeSequences[str.charCodeAt(0)];
+}
+
+// prettier-ignore
+const escapeSequences = [
+  '\\u0000', '\\u0001', '\\u0002', '\\u0003', '\\u0004', '\\u0005', '\\u0006', '\\u0007',
+  '\\b',     '\\t',     '\\n',     '\\u000B', '\\f',     '\\r',     '\\u000E', '\\u000F',
+  '\\u0010', '\\u0011', '\\u0012', '\\u0013', '\\u0014', '\\u0015', '\\u0016', '\\u0017',
+  '\\u0018', '\\u0019', '\\u001A', '\\u001B', '\\u001C', '\\u001D', '\\u001E', '\\u001F',
+  '',        '',        '\\"',     '',        '',        '',        '',        '',
+  '',        '',        '',        '',        '',        '',        '',        '', // 2F
+  '',        '',        '',        '',        '',        '',        '',        '',
+  '',        '',        '',        '',        '',        '',        '',        '', // 3F
+  '',        '',        '',        '',        '',        '',        '',        '',
+  '',        '',        '',        '',        '',        '',        '',        '', // 4F
+  '',        '',        '',        '',        '',        '',        '',        '',
+  '',        '',        '',        '',        '\\\\',    '',        '',        '', // 5F
+  '',        '',        '',        '',        '',        '',        '',        '',
+  '',        '',        '',        '',        '',        '',        '',        '', // 6F
+  '',        '',        '',        '',        '',        '',        '',        '',
+  '',        '',        '',        '',        '',        '',        '',        '\\u007F',
+  '\\u0080', '\\u0081', '\\u0082', '\\u0083', '\\u0084', '\\u0085', '\\u0086', '\\u0087',
+  '\\u0088', '\\u0089', '\\u008A', '\\u008B', '\\u008C', '\\u008D', '\\u008E', '\\u008F',
+  '\\u0090', '\\u0091', '\\u0092', '\\u0093', '\\u0094', '\\u0095', '\\u0096', '\\u0097',
+  '\\u0098', '\\u0099', '\\u009A', '\\u009B', '\\u009C', '\\u009D', '\\u009E', '\\u009F',
+];
diff --git a/src/language/printer.ts b/src/language/printer.ts
index 2134ea1ee6..b6df7d80f9 100644
--- a/src/language/printer.ts
+++ b/src/language/printer.ts
@@ -4,6 +4,7 @@ import type { ASTNode } from './ast';
 import type { ASTReducer } from './visitor';
 import { visit } from './visitor';
 import { printBlockString } from './blockString';
+import { printString } from './printString';
 
 /**
  * Converts an AST into a string, using one set of reasonable
@@ -109,7 +110,7 @@ const printDocASTReducer: ASTReducer<string> = {
   FloatValue: { leave: ({ value }) => value },
   StringValue: {
     leave: ({ value, block: isBlockString }) =>
-      isBlockString ? printBlockString(value) : JSON.stringify(value),
+      isBlockString ? printBlockString(value) : printString(value),
   },
   BooleanValue: { leave: ({ value }) => (value ? 'true' : 'false') },
   NullValue: { leave: () => 'null' },
diff --git a/src/type/__tests__/introspection-test.ts b/src/type/__tests__/introspection-test.ts
index 0a480c3e71..4d5f1398d1 100644
--- a/src/type/__tests__/introspection-test.ts
+++ b/src/type/__tests__/introspection-test.ts
@@ -1070,6 +1070,52 @@ describe('Introspection', () => {
     });
   });
 
+  it('introspects any default value', () => {
+    const schema = buildSchema(`
+      input InputObjectWithDefaultValues {
+        a: String = "Emoji: \\u{1F600}"
+        b: Complex = {x: ["abc"], y: 123}
+      }
+
+      input Complex {
+        x: [String]
+        y: Int
+      }
+
+      type Query {
+        someField(someArg: InputObjectWithDefaultValues): String
+      }
+    `);
+
+    const source = `
+      {
+        __type(name: "InputObjectWithDefaultValues") {
+          inputFields {
+            name
+            defaultValue
+          }
+        }
+      }
+    `;
+
+    expect(graphqlSync({ schema, source })).to.deep.equal({
+      data: {
+        __type: {
+          inputFields: [
+            {
+              name: 'a',
+              defaultValue: '"Emoji: \u{1F600}"',
+            },
+            {
+              name: 'b',
+              defaultValue: '{x: ["abc"], y: 123}',
+            },
+          ],
+        },
+      },
+    });
+  });
+
   it('supports the __type root field', () => {
     const schema = buildSchema(`
       type Query {