Skip to content

Commit

Permalink
RFC: Support full Unicode in lexer
Browse files Browse the repository at this point in the history
Depends on #3115

Implements RFC at graphql/graphql-spec#849.

* Replaces `isSourceCharacter` with `isUnicodeScalarValue`
* Adds `isSupplementaryCodePoint`, used when lexing Strings, BlockStrings, and Comments to ensure correct lexing of JavaScript's UTF-16 source.
* Updates `printCodePointAt` to correctly print supplementary code points.
* Adds variable-width Unicode escape sequences
* Adds explicit support for legacy JSON-style fixed-width Unicode escape sequence surrogate pairs.
* Adds `printString` so that string printing no longer relies on `JSON.stringify`. Borrows some implementation details from Node.js internals for string printing.

  Implements:

  > When producing a {StringValue}, implementations should use escape sequences to
  > represent non-printable control characters (U+0000 to U+001F and U+007F to
  > U+009F). Other escape sequences are not necessary, however an implementation may
  > use escape sequences to represent any other range of code points.

Closes #2449

Co-authored-by: Andreas Marek <andimarek@fastmail.fm>
  • Loading branch information
leebyron and andimarek committed May 19, 2021
1 parent 2cd8510 commit 601b4c2
Show file tree
Hide file tree
Showing 5 changed files with 453 additions and 47 deletions.
255 changes: 224 additions & 31 deletions src/language/__tests__/lexer-test.js
Expand Up @@ -27,13 +27,6 @@ function expectSyntaxError(text: string) {
}

describe('Lexer', () => {
it('disallows uncommon control characters', () => {
expectSyntaxError('\u0007').to.deep.equal({
message: 'Syntax Error: Invalid character: U+0007.',
locations: [{ line: 1, column: 1 }],
});
});

it('ignores BOM header', () => {
expect(lexOne('\uFEFF foo')).to.contain({
kind: TokenKind.NAME,
Expand Down Expand Up @@ -263,12 +256,98 @@ describe('Lexer', () => {
value: 'slashes \\ /',
});

expect(lexOne('"unescaped unicode outside BMP \u{1f600}"')).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 34,
value: 'unescaped unicode outside BMP \u{1f600}',
});

expect(
lexOne('"unescaped maximal unicode outside BMP \u{10ffff}"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 42,
value: 'unescaped maximal unicode outside BMP \u{10ffff}',
});

expect(lexOne('"unicode \\u1234\\u5678\\u90AB\\uCDEF"')).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 34,
value: 'unicode \u1234\u5678\u90AB\uCDEF',
});

expect(lexOne('"unicode \\u{1234}\\u{5678}\\u{90AB}\\u{CDEF}"')).to.contain(
{
kind: TokenKind.STRING,
start: 0,
end: 42,
value: 'unicode \u1234\u5678\u90AB\uCDEF',
},
);

expect(
lexOne('"string with unicode escape outside BMP \\u{1F600}"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 50,
value: 'string with unicode escape outside BMP \u{1f600}',
});

expect(lexOne('"string with minimal unicode escape \\u{0}"')).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 42,
value: 'string with minimal unicode escape \u{0}',
});

expect(
lexOne('"string with maximal unicode escape \\u{10FFFF}"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 47,
value: 'string with maximal unicode escape \u{10FFFF}',
});

expect(
lexOne('"string with maximal minimal unicode escape \\u{00000000}"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 57,
value: 'string with maximal minimal unicode escape \u{0}',
});

expect(
lexOne('"string with unicode surrogate pair escape \\uD83D\\uDE00"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 56,
value: 'string with unicode surrogate pair escape \u{1f600}',
});

expect(
lexOne('"string with minimal surrogate pair escape \\uD800\\uDC00"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 56,
value: 'string with minimal surrogate pair escape \u{10000}',
});

expect(
lexOne('"string with maximal surrogate pair escape \\uDBFF\\uDFFF"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 56,
value: 'string with maximal surrogate pair escape \u{10FFFF}',
});
});

it('lex reports useful string errors', () => {
Expand Down Expand Up @@ -298,16 +377,19 @@ describe('Lexer', () => {
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('"contains unescaped \u0007 control char"').to.deep.equal(
{
message: 'Syntax Error: Invalid character within String: U+0007.',
locations: [{ line: 1, column: 21 }],
},
);
expectSyntaxError('"bad surrogate \uDEAD"').to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+DEAD.',
locations: [{ line: 1, column: 16 }],
});

expectSyntaxError('"bad high surrogate pair \uDEAD\uDEAD"').to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+DEAD.',
locations: [{ line: 1, column: 26 }],
});

expectSyntaxError('"null-byte is not \u0000 end of file"').to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+0000.',
locations: [{ line: 1, column: 19 }],
expectSyntaxError('"bad low surrogate pair \uD800\uD800"').to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+D800.',
locations: [{ line: 1, column: 25 }],
});

expectSyntaxError('"multi\nline"').to.deep.equal({
Expand Down Expand Up @@ -354,6 +436,93 @@ describe('Lexer', () => {
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uXXXF".',
locations: [{ line: 1, column: 6 }],
});

expectSyntaxError('"bad \\u{} esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{}".',
locations: [{ line: 1, column: 6 }],
});

expectSyntaxError('"bad \\u{FXXX} esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FX".',
locations: [{ line: 1, column: 6 }],
});

expectSyntaxError('"bad \\u{FFFF esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF ".',
locations: [{ line: 1, column: 6 }],
});

expectSyntaxError('"bad \\u{FFFF"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF"".',
locations: [{ line: 1, column: 6 }],
});

expectSyntaxError('"too high \\u{110000} esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{110000}".',
locations: [{ line: 1, column: 11 }],
});

expectSyntaxError('"way too high \\u{12345678} esc"').to.deep.equal({
message:
'Syntax Error: Invalid Unicode escape sequence: "\\u{12345678}".',
locations: [{ line: 1, column: 15 }],
});

expectSyntaxError('"too long \\u{000000000} esc"').to.deep.equal({
message:
'Syntax Error: Invalid Unicode escape sequence: "\\u{000000000".',
locations: [{ line: 1, column: 11 }],
});

expectSyntaxError('"bad surrogate \\uDEAD esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".',
locations: [{ line: 1, column: 16 }],
});

expectSyntaxError('"bad surrogate \\u{DEAD} esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{DEAD}".',
locations: [{ line: 1, column: 16 }],
});

expectSyntaxError(
'"cannot use braces for surrogate pair \\u{D83D}\\u{DE00} esc"',
).to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{D83D}".',
locations: [{ line: 1, column: 39 }],
});

expectSyntaxError(
'"bad high surrogate pair \\uDEAD\\uDEAD esc"',
).to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".',
locations: [{ line: 1, column: 26 }],
});

expectSyntaxError(
'"bad low surrogate pair \\uD800\\uD800 esc"',
).to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD800".',
locations: [{ line: 1, column: 25 }],
});

expectSyntaxError(
'"cannot escape half a pair \uD83D\\uDE00 esc"',
).to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+D83D.',
locations: [{ line: 1, column: 28 }],
});

expectSyntaxError(
'"cannot escape half a pair \\uD83D\uDE00 esc"',
).to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".',
locations: [{ line: 1, column: 28 }],
});

expectSyntaxError('"bad \\uD83D\\not an escape"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".',
locations: [{ line: 1, column: 6 }],
});
});

it('lexes block strings', () => {
Expand Down Expand Up @@ -413,6 +582,13 @@ describe('Lexer', () => {
value: 'unescaped \\n\\r\\b\\t\\f\\u1234',
});

expect(lexOne('"""unescaped unicode outside BMP \u{1f600}"""')).to.contain({
kind: TokenKind.BLOCK_STRING,
start: 0,
end: 38,
value: 'unescaped unicode outside BMP \u{1f600}',
});

expect(lexOne('"""slashes \\\\ \\/"""')).to.contain({
kind: TokenKind.BLOCK_STRING,
start: 0,
Expand Down Expand Up @@ -485,18 +661,9 @@ describe('Lexer', () => {
locations: [{ line: 1, column: 16 }],
});

expectSyntaxError(
'"""contains unescaped \u0007 control char"""',
).to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+0007.',
locations: [{ line: 1, column: 23 }],
});

expectSyntaxError(
'"""null-byte is not \u0000 end of file"""',
).to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+0000.',
locations: [{ line: 1, column: 21 }],
expectSyntaxError('"""contains invalid surrogate \uDEAD"""').to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+DEAD.',
locations: [{ line: 1, column: 31 }],
});
});

Expand Down Expand Up @@ -836,10 +1003,30 @@ describe('Lexer', () => {
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('\x00').to.deep.equal({
message: 'Syntax Error: Unexpected character: U+0000.',
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('\b').to.deep.equal({
message: 'Syntax Error: Unexpected character: U+0008.',
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('\u203B').to.deep.equal({
message: 'Syntax Error: Unexpected character: U+203B.',
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('\u{1f600}').to.deep.equal({
message: 'Syntax Error: Unexpected character: U+1F600.',
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('\uDEAD').to.deep.equal({
message: 'Syntax Error: Invalid character: U+DEAD.',
locations: [{ line: 1, column: 1 }],
});
});

it('lex reports useful information for dashes in names', () => {
Expand Down Expand Up @@ -920,9 +1107,15 @@ describe('Lexer', () => {
end: 9,
value: ' Comment',
});
expectSyntaxError('# \u0007').to.deep.equal({
message: 'Syntax Error: Invalid character: U+0007.',
locations: [{ line: 1, column: 3 }],
expect(lexOne('# Comment \u{1f600}').prev).to.contain({
kind: TokenKind.COMMENT,
start: 0,
end: 12,
value: ' Comment \u{1f600}',
});
expectSyntaxError('# Invalid surrogate \uDEAD').to.deep.equal({
message: 'Syntax Error: Invalid character: U+DEAD.',
locations: [{ line: 1, column: 21 }],
});
});
});
Expand Down
54 changes: 54 additions & 0 deletions src/language/__tests__/printString-test.js
@@ -0,0 +1,54 @@
import { expect } from 'chai';
import { describe, it } from 'mocha';

import { printString } from '../printString';

/**
 * Behavioural tests for `printString`: the value is wrapped in double quotes,
 * quotes/backslashes/control characters are escaped, and every other
 * character (including non-ASCII and supplementary ones) is emitted verbatim.
 */
describe('printString', () => {
  it('prints a simple string', () => {
    const printed = printString('hello world');
    expect(printed).to.equal('"hello world"');
  });

  it('escapes quotes', () => {
    const printed = printString('"hello world"');
    expect(printed).to.equal('"\\"hello world\\""');
  });

  it('does not escape single quote', () => {
    const printed = printString("who's test");
    expect(printed).to.equal('"who\'s test"');
  });

  it('escapes backslashes', () => {
    const printed = printString('escape: \\n');
    expect(printed).to.equal('"escape: \\\\n"');
  });

  it('escapes well-known control chars', () => {
    const printed = printString('\b\f\n\r\t');
    expect(printed).to.equal('"\\b\\f\\n\\r\\t"');
  });

  it('escapes zero byte', () => {
    const printed = printString('\x00');
    expect(printed).to.equal('"\\u0000"');
  });

  it('does not escape space', () => {
    const printed = printString(' ');
    expect(printed).to.equal('" "');
  });

  it('escapes all other control chars', () => {
    // Code units that have a dedicated short escape (\b \t \n \f \r); they
    // are covered by the 'well-known control chars' test above.
    const shortEscaped = new Set([0x08, 0x09, 0x0a, 0x0c, 0x0d]);

    for (let code = 0x01; code <= 0x9f; ++code) {
      const isPrintableAscii = code >= 0x0020 && code <= 0x007e;
      if (shortEscaped.has(code) || isPrintableAscii) {
        continue;
      }

      // Everything left must be printed as an upper-case, four-digit \uXXXX.
      const hex = code.toString(16).toUpperCase().padStart(4, '0');
      const printed = printString(String.fromCharCode(code));
      expect(printed).to.equal(`"\\u${hex}"`);
    }
  });

  it('does not escape non-ascii character', () => {
    const printed = printString('\u21BB');
    expect(printed).to.equal('"\u21BB"');
  });

  it('does not escape supplementary character', () => {
    const printed = printString('\u{1f600}');
    expect(printed).to.equal('"\u{1f600}"');
  });
});

0 comments on commit 601b4c2

Please sign in to comment.