From 38ee31093394745e53e5943dbf860424f5e49b2d Mon Sep 17 00:00:00 2001 From: Lee Byron Date: Mon, 17 May 2021 13:42:26 -0700 Subject: [PATCH] RFC: Support full Unicode in lexer Depends on #3115 Implements RFC at graphql/graphql-spec#849. * Replaces `isSourceCharacter` with `isUnicodeScalarValue` * Adds `isSupplementaryCodePoint`, used in String, BlockStrings, and Comments to ensure correct lexing of JavaScript's UTF-16 source. * Updates `printCodePointAt` to correctly print supplementary code points. * Adds variable-width Unicode escape sequences * Adds explicit support for legacy JSON-style fixed-width Unicode escape sequence surrogate pairs. * Adds `printString` to no longer rely on `JSON.stringify`. Borrows some implementation details from Node.js internals for string printing. Implements: > When producing a {StringValue}, implementations should use escape sequences to > represent non-printable control characters (U+0000 to U+001F and U+007F to > U+009F). Other escape sequences are not necessary, however an implementation may > use escape sequences to represent any other range of code points. 
Closes #2449 Co-authored-by: Andreas Marek --- cspell.yml | 3 + src/language/__tests__/lexer-test.ts | 255 ++++++++++++++++++--- src/language/__tests__/printString-test.ts | 82 +++++++ src/language/lexer.ts | 154 +++++++++++-- src/language/printString.ts | 38 +++ src/language/printer.ts | 3 +- src/type/__tests__/introspection-test.ts | 46 ++++ 7 files changed, 531 insertions(+), 50 deletions(-) create mode 100644 src/language/__tests__/printString-test.ts create mode 100644 src/language/printString.ts diff --git a/cspell.yml b/cspell.yml index 36a6cf6e0c..e20b940fa2 100644 --- a/cspell.yml +++ b/cspell.yml @@ -20,6 +20,9 @@ overrides: - filename: '**/docs/APIReference-*.md' ignoreRegExpList: ['/href="[^"]*"/'] +ignoreRegExpList: + - u\{[0-9a-f]{1,8}\} + words: - graphiql - sublinks diff --git a/src/language/__tests__/lexer-test.ts b/src/language/__tests__/lexer-test.ts index d98f68b051..8e26212a77 100644 --- a/src/language/__tests__/lexer-test.ts +++ b/src/language/__tests__/lexer-test.ts @@ -28,13 +28,6 @@ function expectSyntaxError(text: string) { } describe('Lexer', () => { - it('disallows uncommon control characters', () => { - expectSyntaxError('\u0007').to.deep.equal({ - message: 'Syntax Error: Invalid character: U+0007.', - locations: [{ line: 1, column: 1 }], - }); - }); - it('ignores BOM header', () => { expect(lexOne('\uFEFF foo')).to.contain({ kind: TokenKind.NAME, @@ -269,12 +262,98 @@ describe('Lexer', () => { value: 'slashes \\ /', }); + expect(lexOne('"unescaped unicode outside BMP \u{1f600}"')).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 34, + value: 'unescaped unicode outside BMP \u{1f600}', + }); + + expect( + lexOne('"unescaped maximal unicode outside BMP \u{10ffff}"'), + ).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 42, + value: 'unescaped maximal unicode outside BMP \u{10ffff}', + }); + expect(lexOne('"unicode \\u1234\\u5678\\u90AB\\uCDEF"')).to.contain({ kind: TokenKind.STRING, start: 0, end: 34, value: 'unicode 
\u1234\u5678\u90AB\uCDEF', }); + + expect(lexOne('"unicode \\u{1234}\\u{5678}\\u{90AB}\\u{CDEF}"')).to.contain( + { + kind: TokenKind.STRING, + start: 0, + end: 42, + value: 'unicode \u1234\u5678\u90AB\uCDEF', + }, + ); + + expect( + lexOne('"string with unicode escape outside BMP \\u{1F600}"'), + ).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 50, + value: 'string with unicode escape outside BMP \u{1f600}', + }); + + expect(lexOne('"string with minimal unicode escape \\u{0}"')).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 42, + value: 'string with minimal unicode escape \u{0}', + }); + + expect( + lexOne('"string with maximal unicode escape \\u{10FFFF}"'), + ).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 47, + value: 'string with maximal unicode escape \u{10FFFF}', + }); + + expect( + lexOne('"string with maximal minimal unicode escape \\u{00000000}"'), + ).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 57, + value: 'string with maximal minimal unicode escape \u{0}', + }); + + expect( + lexOne('"string with unicode surrogate pair escape \\uD83D\\uDE00"'), + ).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 56, + value: 'string with unicode surrogate pair escape \u{1f600}', + }); + + expect( + lexOne('"string with minimal surrogate pair escape \\uD800\\uDC00"'), + ).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 56, + value: 'string with minimal surrogate pair escape \u{10000}', + }); + + expect( + lexOne('"string with maximal surrogate pair escape \\uDBFF\\uDFFF"'), + ).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 56, + value: 'string with maximal surrogate pair escape \u{10FFFF}', + }); }); it('lex reports useful string errors', () => { @@ -304,16 +383,19 @@ describe('Lexer', () => { locations: [{ line: 1, column: 1 }], }); - expectSyntaxError('"contains unescaped \u0007 control char"').to.deep.equal( - { - message: 'Syntax Error: Invalid character within String: U+0007.', - 
locations: [{ line: 1, column: 21 }], - }, - ); + expectSyntaxError('"bad surrogate \uDEAD"').to.deep.equal({ + message: 'Syntax Error: Invalid character within String: U+DEAD.', + locations: [{ line: 1, column: 16 }], + }); + + expectSyntaxError('"bad high surrogate pair \uDEAD\uDEAD"').to.deep.equal({ + message: 'Syntax Error: Invalid character within String: U+DEAD.', + locations: [{ line: 1, column: 26 }], + }); - expectSyntaxError('"null-byte is not \u0000 end of file"').to.deep.equal({ - message: 'Syntax Error: Invalid character within String: U+0000.', - locations: [{ line: 1, column: 19 }], + expectSyntaxError('"bad low surrogate pair \uD800\uD800"').to.deep.equal({ + message: 'Syntax Error: Invalid character within String: U+D800.', + locations: [{ line: 1, column: 25 }], }); expectSyntaxError('"multi\nline"').to.deep.equal({ @@ -360,6 +442,93 @@ describe('Lexer', () => { message: 'Syntax Error: Invalid Unicode escape sequence: "\\uXXXF".', locations: [{ line: 1, column: 6 }], }); + + expectSyntaxError('"bad \\u{} esc"').to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{}".', + locations: [{ line: 1, column: 6 }], + }); + + expectSyntaxError('"bad \\u{FXXX} esc"').to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FX".', + locations: [{ line: 1, column: 6 }], + }); + + expectSyntaxError('"bad \\u{FFFF esc"').to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF ".', + locations: [{ line: 1, column: 6 }], + }); + + expectSyntaxError('"bad \\u{FFFF"').to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF"".', + locations: [{ line: 1, column: 6 }], + }); + + expectSyntaxError('"too high \\u{110000} esc"').to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{110000}".', + locations: [{ line: 1, column: 11 }], + }); + + expectSyntaxError('"way too high \\u{12345678} esc"').to.deep.equal({ + message: + 'Syntax Error: 
Invalid Unicode escape sequence: "\\u{12345678}".', + locations: [{ line: 1, column: 15 }], + }); + + expectSyntaxError('"too long \\u{000000000} esc"').to.deep.equal({ + message: + 'Syntax Error: Invalid Unicode escape sequence: "\\u{000000000".', + locations: [{ line: 1, column: 11 }], + }); + + expectSyntaxError('"bad surrogate \\uDEAD esc"').to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".', + locations: [{ line: 1, column: 16 }], + }); + + expectSyntaxError('"bad surrogate \\u{DEAD} esc"').to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{DEAD}".', + locations: [{ line: 1, column: 16 }], + }); + + expectSyntaxError( + '"cannot use braces for surrogate pair \\u{D83D}\\u{DE00} esc"', + ).to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{D83D}".', + locations: [{ line: 1, column: 39 }], + }); + + expectSyntaxError( + '"bad high surrogate pair \\uDEAD\\uDEAD esc"', + ).to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".', + locations: [{ line: 1, column: 26 }], + }); + + expectSyntaxError( + '"bad low surrogate pair \\uD800\\uD800 esc"', + ).to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD800".', + locations: [{ line: 1, column: 25 }], + }); + + expectSyntaxError( + '"cannot escape half a pair \uD83D\\uDE00 esc"', + ).to.deep.equal({ + message: 'Syntax Error: Invalid character within String: U+D83D.', + locations: [{ line: 1, column: 28 }], + }); + + expectSyntaxError( + '"cannot escape half a pair \\uD83D\uDE00 esc"', + ).to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".', + locations: [{ line: 1, column: 28 }], + }); + + expectSyntaxError('"bad \\uD83D\\not an escape"').to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".', + locations: [{ line: 1, column: 6 }], + }); }); it('lexes block strings', () => { @@ -419,6 +588,13 @@ 
describe('Lexer', () => { value: 'unescaped \\n\\r\\b\\t\\f\\u1234', }); + expect(lexOne('"""unescaped unicode outside BMP \u{1f600}"""')).to.contain({ + kind: TokenKind.BLOCK_STRING, + start: 0, + end: 38, + value: 'unescaped unicode outside BMP \u{1f600}', + }); + expect(lexOne('"""slashes \\\\ \\/"""')).to.contain({ kind: TokenKind.BLOCK_STRING, start: 0, @@ -491,18 +667,9 @@ describe('Lexer', () => { locations: [{ line: 1, column: 16 }], }); - expectSyntaxError( - '"""contains unescaped \u0007 control char"""', - ).to.deep.equal({ - message: 'Syntax Error: Invalid character within String: U+0007.', - locations: [{ line: 1, column: 23 }], - }); - - expectSyntaxError( - '"""null-byte is not \u0000 end of file"""', - ).to.deep.equal({ - message: 'Syntax Error: Invalid character within String: U+0000.', - locations: [{ line: 1, column: 21 }], + expectSyntaxError('"""contains invalid surrogate \uDEAD"""').to.deep.equal({ + message: 'Syntax Error: Invalid character within String: U+DEAD.', + locations: [{ line: 1, column: 31 }], }); }); @@ -842,6 +1009,16 @@ describe('Lexer', () => { locations: [{ line: 1, column: 1 }], }); + expectSyntaxError('\x00').to.deep.equal({ + message: 'Syntax Error: Unexpected character: U+0000.', + locations: [{ line: 1, column: 1 }], + }); + + expectSyntaxError('\b').to.deep.equal({ + message: 'Syntax Error: Unexpected character: U+0008.', + locations: [{ line: 1, column: 1 }], + }); + expectSyntaxError('\u00AA').to.deep.equal({ message: 'Syntax Error: Unexpected character: U+00AA.', locations: [{ line: 1, column: 1 }], @@ -856,6 +1033,16 @@ describe('Lexer', () => { message: 'Syntax Error: Unexpected character: U+203B.', locations: [{ line: 1, column: 1 }], }); + + expectSyntaxError('\u{1f600}').to.deep.equal({ + message: 'Syntax Error: Unexpected character: U+1F600.', + locations: [{ line: 1, column: 1 }], + }); + + expectSyntaxError('\uDEAD').to.deep.equal({ + message: 'Syntax Error: Invalid character: U+DEAD.', + locations: [{ line: 
1, column: 1 }], + }); }); it('lex reports useful information for dashes in names', () => { @@ -936,9 +1123,15 @@ describe('Lexer', () => { end: 9, value: ' Comment', }); - expectSyntaxError('# \u0007').to.deep.equal({ - message: 'Syntax Error: Invalid character: U+0007.', - locations: [{ line: 1, column: 3 }], + expect(lexOne('# Comment \u{1f600}').prev).to.contain({ + kind: TokenKind.COMMENT, + start: 0, + end: 12, + value: ' Comment \u{1f600}', + }); + expectSyntaxError('# Invalid surrogate \uDEAD').to.deep.equal({ + message: 'Syntax Error: Invalid character: U+DEAD.', + locations: [{ line: 1, column: 21 }], }); }); }); diff --git a/src/language/__tests__/printString-test.ts b/src/language/__tests__/printString-test.ts new file mode 100644 index 0000000000..fff1bfeec0 --- /dev/null +++ b/src/language/__tests__/printString-test.ts @@ -0,0 +1,82 @@ +import { expect } from 'chai'; +import { describe, it } from 'mocha'; + +import { printString } from '../printString'; + +describe('printString', () => { + it('prints a simple string', () => { + expect(printString('hello world')).to.equal('"hello world"'); + }); + + it('escapes quotes', () => { + expect(printString('"hello world"')).to.equal('"\\"hello world\\""'); + }); + + it('does not escape single quote', () => { + expect(printString("who's test")).to.equal('"who\'s test"'); + }); + + it('escapes backslashes', () => { + expect(printString('escape: \\')).to.equal('"escape: \\\\"'); + }); + + it('escapes well-known control chars', () => { + expect(printString('\b\f\n\r\t')).to.equal('"\\b\\f\\n\\r\\t"'); + }); + + it('escapes zero byte', () => { + expect(printString('\x00')).to.equal('"\\u0000"'); + }); + + it('does not escape space', () => { + expect(printString(' ')).to.equal('" "'); + }); + + it('does not escape non-ascii character', () => { + expect(printString('\u21BB')).to.equal('"\u21BB"'); + }); + + it('does not escape supplementary character', () => { + 
expect(printString('\u{1f600}')).to.equal('"\u{1f600}"'); + }); + + it('escapes all control chars', () => { + /* spellchecker:ignore abcdefghijklmnopqrstuvwxyz */ + expect( + printString( + '\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007' + + '\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F' + + '\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017' + + '\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F' + + '\u0020\u0021\u0022\u0023\u0024\u0025\u0026\u0027' + + '\u0028\u0029\u002A\u002B\u002C\u002D\u002E\u002F' + + '\u0030\u0031\u0032\u0033\u0034\u0035\u0036\u0037' + + '\u0038\u0039\u003A\u003B\u003C\u003D\u003E\u003F' + + '\u0040\u0041\u0042\u0043\u0044\u0045\u0046\u0047' + + '\u0048\u0049\u004A\u004B\u004C\u004D\u004E\u004F' + + '\u0050\u0051\u0052\u0053\u0054\u0055\u0056\u0057' + + '\u0058\u0059\u005A\u005B\u005C\u005D\u005E\u005F' + + '\u0060\u0061\u0062\u0063\u0064\u0065\u0066\u0067' + + '\u0068\u0069\u006A\u006B\u006C\u006D\u006E\u006F' + + '\u0070\u0071\u0072\u0073\u0074\u0075\u0076\u0077' + + '\u0078\u0079\u007A\u007B\u007C\u007D\u007E\u007F' + + '\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087' + + '\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F' + + '\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097' + + '\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F', + ), + ).to.equal( + '"\\u0000\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006\\u0007' + + '\\b\\t\\n\\u000B\\f\\r\\u000E\\u000F' + + '\\u0010\\u0011\\u0012\\u0013\\u0014\\u0015\\u0016\\u0017' + + '\\u0018\\u0019\\u001A\\u001B\\u001C\\u001D\\u001E\\u001F' + + ' !\\"#$%&\'()*+,-./0123456789:;<=>?' 
+ + '@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_' + + '`abcdefghijklmnopqrstuvwxyz{|}~\\u007F' + + '\\u0080\\u0081\\u0082\\u0083\\u0084\\u0085\\u0086\\u0087' + + '\\u0088\\u0089\\u008A\\u008B\\u008C\\u008D\\u008E\\u008F' + + '\\u0090\\u0091\\u0092\\u0093\\u0094\\u0095\\u0096\\u0097' + + '\\u0098\\u0099\\u009A\\u009B\\u009C\\u009D\\u009E\\u009F"', + ); + }); +}); diff --git a/src/language/lexer.ts b/src/language/lexer.ts index 5fdb6466ef..21e872522f 100644 --- a/src/language/lexer.ts +++ b/src/language/lexer.ts @@ -108,20 +108,53 @@ export function isPunctuatorTokenKind(kind: TokenKindEnum): boolean { } /** - * ``` + * A Unicode scalar value is any Unicode code point except surrogate code + * points. In other words, the inclusive ranges of values 0x0000 to 0xD7FF and + * 0xE000 to 0x10FFFF. + * * SourceCharacter :: - * - U+0009 (Horizontal Tab) - * - U+000A (New Line) - * - U+000D (Carriage Return) - * - U+0020-U+FFFF - * ``` + * - "Any Unicode scalar value" */ -function isSourceCharacter(code: number): boolean { +function isUnicodeScalarValue(code: number): boolean { return ( - code >= 0x0020 || code === 0x0009 || code === 0x000a || code === 0x000d + (code >= 0x0000 && code <= 0xd7ff) || (code >= 0xe000 && code <= 0x10ffff) + ); +} + +/** + * The GraphQL specification defines source text as a sequence of unicode scalar + * values (which Unicode defines to exclude surrogate code points). However + * JavaScript defines strings as a sequence of UTF-16 code units which may + * include surrogates. A surrogate pair is a valid source character as it + * encodes a supplementary code point (above U+FFFF), but unpaired surrogate + * code points are not valid source characters. 
+ */ +function isSupplementaryCodePoint(body: string, location: number): boolean { + return ( + isLeadingSurrogate(body.charCodeAt(location)) && + isTrailingSurrogate(body.charCodeAt(location + 1)) + ); +} + +function isLeadingSurrogate(code: number): boolean { + return code >= 0xd800 && code <= 0xdbff; +} + +function isTrailingSurrogate(code: number): boolean { + return code >= 0xdc00 && code <= 0xdfff; +} + +function encodeSurrogatePair(point: number): string { + return String.fromCharCode( + 0xd800 | ((point - 0x10000) >> 10), // Leading Surrogate + 0xdc00 | ((point - 0x10000) & 0x3ff), // Trailing Surrogate ); } +function decodeSurrogatePair(leading: number, trailing: number): number { + return 0x10000 + (((leading & 0x03ff) << 10) | (trailing & 0x03ff)); +} + /** * Prints the code point (or end of file reference) at a given location in a * source for use in error messages. @@ -140,9 +173,12 @@ function printCodePointAt(lexer: Lexer, location: number): string { return code === 0x0022 ? "'\"'" : `"${body[location]}"`; } // Unicode code point + const point = isSupplementaryCodePoint(body, location) + ? decodeSurrogatePair(code, body.charCodeAt(location + 1)) + : code; const zeroPad = - code > 0xfff ? '' : code > 0xff ? '0' : code > 0xf ? '00' : '000'; - return `U+${zeroPad}${code.toString(16).toUpperCase()}`; + point > 0xfff ? '' : point > 0xff ? '0' : point > 0xf ? '00' : '000'; + return `U+${zeroPad}${point.toString(16).toUpperCase()}`; } /** @@ -286,7 +322,7 @@ function readNextToken(lexer: Lexer, start: number): Token { position, code === 0x0027 ? 'Unexpected single quote character (\'), did you mean to use a double quote (")?' - : isSourceCharacter(code) + : isUnicodeScalarValue(code) || isSupplementaryCodePoint(body, position) ? 
`Unexpected character: ${printCodePointAt(lexer, position)}.` : `Invalid character: ${printCodePointAt(lexer, position)}.`, ); @@ -318,8 +354,10 @@ function readComment(lexer: Lexer, start: number): Token { } // SourceCharacter - if (isSourceCharacter(code)) { + if (isUnicodeScalarValue(code)) { ++position; + } else if (isSupplementaryCodePoint(body, position)) { + position += 2; } else { break; } @@ -474,7 +512,9 @@ function readDigits(lexer: Lexer, start: number, firstCode: number): number { * - `\u` EscapedUnicode * - `\` EscapedCharacter * - * EscapedUnicode :: /[0-9A-Fa-f]{4}/ + * EscapedUnicode :: + * - `{` HexDigit+ `}` + * - HexDigit HexDigit HexDigit HexDigit * * EscapedCharacter :: one of `"` `\` `/` `b` `f` `n` `r` `t` * ``` @@ -500,7 +540,9 @@ function readString(lexer: Lexer, start: number): Token { value += body.slice(chunkStart, position); const escape = body.charCodeAt(position + 1) === 0x0075 // u - ? readEscapedUnicode(lexer, position) + ? body.charCodeAt(position + 2) === 0x007b // { + ? readEscapedUnicodeVariableWidth(lexer, position) + : readEscapedUnicodeFixedWidth(lexer, position) : readEscapedCharacter(lexer, position); value += escape.value; position += escape.size; @@ -514,8 +556,10 @@ function readString(lexer: Lexer, start: number): Token { } // SourceCharacter - if (isSourceCharacter(code)) { + if (isUnicodeScalarValue(code)) { ++position; + } else if (isSupplementaryCodePoint(body, position)) { + position += 2; } else { throw syntaxError( lexer.source, @@ -537,14 +581,81 @@ interface EscapeSequence { size: number; } -function readEscapedUnicode(lexer: Lexer, position: number): EscapeSequence { +function readEscapedUnicodeVariableWidth( + lexer: Lexer, + position: number, +): EscapeSequence { + const body = lexer.source.body; + let point = 0; + let size = 3; + // Cannot be larger than 12 chars (\u{00000000}). 
+ while (size < 12) { + const code = body.charCodeAt(position + size++); + // Closing Brace (}) + if (code === 0x007d) { + // Must be at least 5 chars (\u{0}) and encode a Unicode scalar value. + if (size < 5 || !isUnicodeScalarValue(point)) { + break; + } + // JavaScript defines strings as a sequence of UTF-16 code units and + // encodes Unicode code points above U+FFFF using a surrogate pair. + return { + value: + point <= 0xffff + ? String.fromCharCode(point) + : encodeSurrogatePair(point), + size, + }; + } + // Append this hex digit to the code point. + point = (point << 4) | readHexDigit(code); + if (point < 0) { + break; + } + } + + throw syntaxError( + lexer.source, + position, + `Invalid Unicode escape sequence: "${body.slice( + position, + position + size, + )}".`, + ); +} + +function readEscapedUnicodeFixedWidth( + lexer: Lexer, + position: number, +): EscapeSequence { const body = lexer.source.body; const code = read16BitHexCode(body, position + 2); - if (code >= 0) { + if (isUnicodeScalarValue(code)) { return { value: String.fromCharCode(code), size: 6 }; } + // GraphQL allows JSON-style surrogate pair escape sequences, but only when + // a valid pair is formed. + if (isLeadingSurrogate(code)) { + // \u + if ( + body.charCodeAt(position + 6) === 0x005c && + body.charCodeAt(position + 7) === 0x0075 + ) { + const trailingCode = read16BitHexCode(body, position + 8); + if (isTrailingSurrogate(trailingCode)) { + // JavaScript defines strings as a sequence of UTF-16 code units and + // encodes Unicode code points above U+FFFF using a surrogate pair of + // code units. Since this is a surrogate pair escape sequence, just + // include both codes into the JavaScript string value. Had JavaScript + // not been internally based on UTF-16, then this surrogate pair would + // be decoded to retrieve the supplementary code point. 
+ return { value: String.fromCharCode(code, trailingCode), size: 12 }; + } + } + } + throw syntaxError( lexer.source, position, @@ -578,6 +689,11 @@ function read16BitHexCode(body: string, position: number): number { * 'a' becomes 10, 'f' becomes 15 * * Returns -1 if the provided character code was not a valid hexadecimal digit. + * + * HexDigit :: one of + * - `0` `1` `2` `3` `4` `5` `6` `7` `8` `9` + * - `A` `B` `C` `D` `E` `F` + * - `a` `b` `c` `d` `e` `f` */ function readHexDigit(code: number): number { return code >= 0x0030 && code <= 0x0039 // 0-9 @@ -696,8 +812,10 @@ function readBlockString(lexer: Lexer, start: number): Token { } // SourceCharacter - if (isSourceCharacter(code)) { + if (isUnicodeScalarValue(code)) { ++position; + } else if (isSupplementaryCodePoint(body, position)) { + position += 2; } else { throw syntaxError( lexer.source, diff --git a/src/language/printString.ts b/src/language/printString.ts new file mode 100644 index 0000000000..b091bcc2c1 --- /dev/null +++ b/src/language/printString.ts @@ -0,0 +1,38 @@ +/** + * Prints a string as a GraphQL StringValue literal. Replaces control characters + * and excluded characters (" U+0022 and \\ U+005C) with escape sequences. 
+ */ +export function printString(str: string): string { + return `"${str.replace(escapedRegExp, escapedReplacer)}"`; +} + +// eslint-disable-next-line no-control-regex +const escapedRegExp = /[\x00-\x1f\x22\x5c\x7f-\x9f]/g; + +function escapedReplacer(str: string): string { + return escapeSequences[str.charCodeAt(0)]; +} + +// prettier-ignore +const escapeSequences = [ + '\\u0000', '\\u0001', '\\u0002', '\\u0003', '\\u0004', '\\u0005', '\\u0006', '\\u0007', + '\\b', '\\t', '\\n', '\\u000B', '\\f', '\\r', '\\u000E', '\\u000F', + '\\u0010', '\\u0011', '\\u0012', '\\u0013', '\\u0014', '\\u0015', '\\u0016', '\\u0017', + '\\u0018', '\\u0019', '\\u001A', '\\u001B', '\\u001C', '\\u001D', '\\u001E', '\\u001F', + '', '', '\\"', '', '', '', '', '', + '', '', '', '', '', '', '', '', // 2F + '', '', '', '', '', '', '', '', + '', '', '', '', '', '', '', '', // 3F + '', '', '', '', '', '', '', '', + '', '', '', '', '', '', '', '', // 4F + '', '', '', '', '', '', '', '', + '', '', '', '', '\\\\', '', '', '', // 5F + '', '', '', '', '', '', '', '', + '', '', '', '', '', '', '', '', // 6F + '', '', '', '', '', '', '', '', + '', '', '', '', '', '', '', '\\u007F', + '\\u0080', '\\u0081', '\\u0082', '\\u0083', '\\u0084', '\\u0085', '\\u0086', '\\u0087', + '\\u0088', '\\u0089', '\\u008A', '\\u008B', '\\u008C', '\\u008D', '\\u008E', '\\u008F', + '\\u0090', '\\u0091', '\\u0092', '\\u0093', '\\u0094', '\\u0095', '\\u0096', '\\u0097', + '\\u0098', '\\u0099', '\\u009A', '\\u009B', '\\u009C', '\\u009D', '\\u009E', '\\u009F', +]; diff --git a/src/language/printer.ts b/src/language/printer.ts index 2134ea1ee6..b6df7d80f9 100644 --- a/src/language/printer.ts +++ b/src/language/printer.ts @@ -4,6 +4,7 @@ import type { ASTNode } from './ast'; import type { ASTReducer } from './visitor'; import { visit } from './visitor'; import { printBlockString } from './blockString'; +import { printString } from './printString'; /** * Converts an AST into a string, using one set of reasonable @@ -109,7 +110,7 
@@ const printDocASTReducer: ASTReducer = { FloatValue: { leave: ({ value }) => value }, StringValue: { leave: ({ value, block: isBlockString }) => - isBlockString ? printBlockString(value) : JSON.stringify(value), + isBlockString ? printBlockString(value) : printString(value), }, BooleanValue: { leave: ({ value }) => (value ? 'true' : 'false') }, NullValue: { leave: () => 'null' }, diff --git a/src/type/__tests__/introspection-test.ts b/src/type/__tests__/introspection-test.ts index 0a480c3e71..4d5f1398d1 100644 --- a/src/type/__tests__/introspection-test.ts +++ b/src/type/__tests__/introspection-test.ts @@ -1070,6 +1070,52 @@ describe('Introspection', () => { }); }); + it('introspects any default value', () => { + const schema = buildSchema(` + input InputObjectWithDefaultValues { + a: String = "Emoji: \\u{1F600}" + b: Complex = {x: ["abc"], y: 123} + } + + input Complex { + x: [String] + y: Int + } + + type Query { + someField(someArg: InputObjectWithDefaultValues): String + } + `); + + const source = ` + { + __type(name: "InputObjectWithDefaultValues") { + inputFields { + name + defaultValue + } + } + } + `; + + expect(graphqlSync({ schema, source })).to.deep.equal({ + data: { + __type: { + inputFields: [ + { + name: 'a', + defaultValue: '"Emoji: \u{1F600}"', + }, + { + name: 'b', + defaultValue: '{x: ["abc"], y: 123}', + }, + ], + }, + }, + }); + }); + it('supports the __type root field', () => { const schema = buildSchema(` type Query {