Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support full unicode in parser #2404

Merged
merged 16 commits into from Jul 14, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/main/antlr/GraphqlCommon.g4
Expand Up @@ -120,12 +120,12 @@ fragment BlockStringCharacter:
ExtendedSourceCharacter;

fragment StringCharacter:
([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) | // this is SoureCharacter without '"' and '\'
([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) | // this is SourceCharacter without '"' and '\'
dondonz marked this conversation as resolved.
Show resolved Hide resolved
'\\u' EscapedUnicode |
'\\' EscapedCharacter;

fragment EscapedCharacter : ["\\/bfnrt];
fragment EscapedUnicode : Hex Hex Hex Hex;
fragment EscapedUnicode : Hex Hex Hex Hex | '{' Hex+ '}';
fragment Hex : [0-9a-fA-F];


Expand Down
13 changes: 7 additions & 6 deletions src/main/java/graphql/parser/StringValueParsing.java
Expand Up @@ -30,7 +30,9 @@ public static String removeIndentation(String rawValue) {
String[] lines = rawValue.split("\\n");
Integer commonIndent = null;
for (int i = 0; i < lines.length; i++) {
if (i == 0) continue;
if (i == 0) {
continue;
}
String line = lines[i];
int length = line.length();
int indent = leadingWhitespace(line);
Expand All @@ -44,7 +46,9 @@ public static String removeIndentation(String rawValue) {
if (commonIndent != null) {
for (int i = 0; i < lineList.size(); i++) {
String line = lineList.get(i);
if (i == 0) continue;
if (i == 0) {
continue;
}
if (line.length() > commonIndent) {
line = line.substring(commonIndent);
lineList.set(i, line);
Expand Down Expand Up @@ -135,10 +139,7 @@ public static String parseSingleQuotedString(String string) {
writer.write('\t');
continue;
case 'u':
String hexStr = string.substring(i + 1, i + 5);
int codepoint = Integer.parseInt(hexStr, 16);
i += 4;
writer.write(codepoint);
i = UnicodeUtil.parseAndWriteUnicode(writer, string, i);
continue;
default:
Assert.assertShouldNeverHappen();
Expand Down
64 changes: 64 additions & 0 deletions src/main/java/graphql/parser/UnicodeUtil.java
@@ -0,0 +1,64 @@
package graphql.parser;

import graphql.Assert;
import graphql.Internal;

import java.io.IOException;
import java.io.StringWriter;

/**
* Contains Unicode helpers for parsing StringValue types in the grammar
*/
@Internal
public class UnicodeUtil {
    /** Highest code point defined by Unicode (U+10FFFF). */
    public static final int MAX_UNICODE_CODE_POINT = 0x10FFFF;

    /** Number of hex digits in the fixed-width escape form. */
    private static final int FIXED_WIDTH_HEX_DIGITS = 4;
    /** First code point of the surrogate range, which is not a Unicode scalar value. */
    private static final int MIN_SURROGATE_CODE_POINT = 0xD800;
    /** Last code point of the surrogate range, which is not a Unicode scalar value. */
    private static final int MAX_SURROGATE_CODE_POINT = 0xDFFF;

    private static final String INVALID_ENCODING_MESSAGE = "invalid unicode encoding";
    private static final String INVALID_CODE_POINT_MESSAGE = "invalid unicode code point";

    /**
     * Decodes a single unicode escape sequence and appends the decoded text to {@code writer}.
     *
     * <p>Two forms are understood:
     * <ul>
     * <li>exactly four hex digits, e.g. {@code u597D}; the value is written as one UTF-16 code
     * unit, so a surrogate pair written as two consecutive escapes still yields the
     * supplementary character</li>
     * <li>one or more hex digits inside braces, e.g. {@code u{1F37A}}; the value must be a
     * Unicode scalar value (at most {@code 0x10FFFF} and not a surrogate)</li>
     * </ul>
     *
     * @param writer destination for the decoded character(s)
     * @param string the text containing the escape sequence
     * @param i      index of the {@code u} that introduces the escape
     * @return index of the last character consumed: the final hex digit for the four digit form,
     * the closing brace for the braced form
     * @throws IllegalArgumentException with message "invalid unicode encoding" when the escape is
     *                                  truncated, unterminated, empty or contains a non hex
     *                                  character, and with message "invalid unicode code point"
     *                                  when a braced value is not a Unicode scalar value
     */
    public static int parseAndWriteUnicode(StringWriter writer, String string, int i) {
        if (i + 1 >= string.length()) {
            // nothing follows the 'u', so there is no escape to decode
            throw new IllegalArgumentException(INVALID_ENCODING_MESSAGE);
        }
        if (isNotBracedEscape(string, i)) {
            return writeFixedWidthEscape(writer, string, i);
        }
        return writeBracedEscape(writer, string, i);
    }

    /** Handles the four hex digit form; returns the index of the last hex digit. */
    private static int writeFixedWidthEscape(StringWriter writer, String string, int i) {
        int lastHexIx = i + FIXED_WIDTH_HEX_DIGITS;
        if (lastHexIx >= string.length()) {
            throw new IllegalArgumentException(INVALID_ENCODING_MESSAGE);
        }
        int codeUnit = parseHex(string, i + 1, lastHexIx + 1);
        // Four hex digits never exceed 0xFFFF, so the value is exactly one char.
        // Surrogates are deliberately not rejected here: a pair is spelled as two escapes.
        writer.write(codeUnit);
        return lastHexIx;
    }

    /** Handles the braced form; returns the index of the closing brace. */
    private static int writeBracedEscape(StringWriter writer, String string, int i) {
        int startIx = i + 2; // skip over 'u' and '{'
        int closingBraceIx = string.indexOf('}', startIx);
        if (closingBraceIx <= startIx) {
            // either no closing brace at all (-1) or nothing between the braces
            throw new IllegalArgumentException(INVALID_ENCODING_MESSAGE);
        }
        int codePoint = parseHex(string, startIx, closingBraceIx);
        if (!isValidUnicodeCodePoint(codePoint)) {
            throw new IllegalArgumentException(INVALID_CODE_POINT_MESSAGE);
        }
        char[] chars = Character.toChars(codePoint);
        // the three argument overload is declared by StringWriter without a checked exception
        writer.write(chars, 0, chars.length);
        return closingBraceIx;
    }

    /**
     * Parses the hex digits in {@code [from, toExclusive)}. Leading zeroes are accepted. Once the
     * running value exceeds {@link #MAX_UNICODE_CODE_POINT} it stops growing, so arbitrarily long
     * digit runs cannot overflow; the caller then sees a value that is out of range.
     */
    private static int parseHex(String string, int from, int toExclusive) {
        int value = 0;
        for (int ix = from; ix < toExclusive; ix++) {
            int digit = hexDigitValue(string.charAt(ix));
            if (digit < 0) {
                throw new IllegalArgumentException(INVALID_ENCODING_MESSAGE);
            }
            if (value <= MAX_UNICODE_CODE_POINT) {
                value = value * 16 + digit;
            }
        }
        return value;
    }

    /** Returns the value of an ASCII hex digit, or -1 when {@code c} is not one. */
    private static int hexDigitValue(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + 10;
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + 10;
        }
        return -1;
    }

    /** True when the character after the {@code u} at index {@code i} is not an opening brace. */
    private static boolean isNotBracedEscape(String string, int i) {
        return string.charAt(i + 1) != '{';
    }

    /** True when {@code value} is a Unicode scalar value: in range and not a surrogate. */
    private static boolean isValidUnicodeCodePoint(int value) {
        if (value < 0 || value > MAX_UNICODE_CODE_POINT) {
            return false;
        }
        return value < MIN_SURROGATE_CODE_POINT || value > MAX_SURROGATE_CODE_POINT;
    }
}
10 changes: 4 additions & 6 deletions src/test/groovy/graphql/parser/StringValueParsingTest.groovy
Expand Up @@ -40,8 +40,7 @@ class StringValueParsingTest extends Specification {
parsed == '''"'''
}

def "parsing emoji should work"() {
// needs surrogate pairs for this emoji
def "parsing beer stein as surrogate pair should work"() {
given:
def input = '''"\\ud83c\\udf7a"'''

Expand All @@ -52,18 +51,17 @@ class StringValueParsingTest extends Specification {
parsed == '''🍺''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug
}

def "parsing simple unicode should work"() {
def "parsing simple unicode should work - Basic Multilingual Plane (BMP)"() {
given:
def input = '''"\\u56fe"'''
def input = '''"\\u5564\\u9152"'''

when:
String parsed = StringValueParsing.parseSingleQuotedString(input)

then:
parsed == ''''''
parsed == '''啤酒'''
}


def "parsing triple quoted string should work"() {
given:
def input = '''"""triple quoted"""'''
Expand Down
199 changes: 199 additions & 0 deletions src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy
@@ -0,0 +1,199 @@
package graphql.parser

import graphql.language.Document
import graphql.language.Field
import graphql.language.OperationDefinition
import graphql.language.StringValue
import graphql.schema.validation.InvalidSchemaException
import spock.lang.Ignore
import spock.lang.Specification

/*
Spock specification for unicode escape handling, exercised through
StringValueParsing.parseSingleQuotedString and Parser.parse.
It covers braced escapes, surrogate pair escapes and invalid input.
Cases annotated with @Ignore are placeholders whose expected behaviour is still an open TODO.
*/
class UnicodeUtilParserTest extends Specification {
/*
Implements RFC to support full Unicode
Original RFC https://github.com/graphql/graphql-spec/issues/687
RFC spec text https://github.com/graphql/graphql-spec/pull/849
RFC JS implementation https://github.com/graphql/graphql-js/pull/3117

TL;DR
Previously, valid SourceCharacters included Unicode scalar values up to and including U+FFFF - the Basic Multilingual Plane (BMP)
Now this is changing to incorporate all Unicode scalar values
Assert {value} is within the *Unicode scalar value* range (>= 0x0000 and <= 0xD7FF or >= 0xE000 and <= 0x10FFFF).
Practically this means you can have your beer emoji (U+1F37A) in queries as \\u{1F37A}
*/

// With this RFC, code points outside the Basic Multilingual Plane can be parsed. For example, emojis
// Previously emojis could only be parsed with surrogate pairs. Now they can be parsed with the code point directly
def "parsing beer stein as escaped unicode"() {
given:
def input = '''"\\u{1F37A} hello"'''

when:
String parsed = StringValueParsing.parseSingleQuotedString(input)

then:
parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug
}

// Here the emoji appears literally in the input rather than as an escape sequence
def "parsing beer mug non escaped"() {
given:
def input = '''"🍺 hello"'''

when:
String parsed = StringValueParsing.parseSingleQuotedString(input)

then:
parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug
}

// Same braced escape as above, but driven through the full document parser as a field argument
def "allow braced escaped unicode"() {
def input = '''
{
foo(arg: "\\u{1F37A}")
}
'''

when:
Document document = Parser.parse(input)
OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition)
def field = operationDefinition.getSelectionSet().getSelections()[0] as Field
def argValue = field.arguments[0].value as StringValue

then:
argValue.getValue() == "🍺"
}

/*
From the RFC:
For legacy reasons, a *supplementary character* may be escaped by two
fixed-width unicode escape sequences forming a *surrogate pair*. For example
the input `"\\uD83D\\uDCA9"` is a valid {StringValue} which represents the same
Unicode text as `"\\u{1F4A9}"`. While this legacy form is allowed, it should be
avoided as a variable-width unicode escape sequence is a clearer way to encode
such code points.
*/
def "allow surrogate pairs escaped unicode"() {
def input = '''
{
foo(arg: "\\ud83c\\udf7a")
}
'''

when:
Document document = Parser.parse(input)
OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition)
def field = operationDefinition.getSelectionSet().getSelections()[0] as Field
def argValue = field.arguments[0].value as StringValue

then:
argValue.getValue() == "🍺"
}

/*
From the RFC:
* If {leadingValue} is >= 0xD800 and <= 0xDBFF (a *Leading Surrogate*):
* Assert {trailingValue} is >= 0xDC00 and <= 0xDFFF (a *Trailing Surrogate*).
* Return ({leadingValue} - 0xD800) × 0x400 + ({trailingValue} - 0xDC00) + 0x10000.
*/
// The second escape, 0xDBFF, falls in the leading surrogate range quoted above, so it cannot act as a trailing surrogate
// Ignored placeholder: the "then" block is just "false" until the expected exception is decided
@Ignore
def "invalid surrogate pair"() {
def input = '''
{
foo(arg: "\\uD83D\\uDBFF")
}
'''

when:
Document document = Parser.parse(input)

then:
// TODO: Raise exception
dondonz marked this conversation as resolved.
Show resolved Hide resolved
false
}

// 0xfffffff is larger than the maximum code point 0x10FFFF, so parsing is expected to fail
def "invalid unicode code point"() {
def input = '''
{
foo(arg: "\\u{fffffff}")
}
'''

when:
Document document = Parser.parse(input)

then:
Exception e = thrown(Exception)
e.message == "invalid unicode code point"
}

// A single leading surrogate with no trailing surrogate after it
// Ignored placeholder: the "then" block is just "false" until the behaviour is decided
@Ignore
def "invalid unpaired surrogate" () {
def input = '''
{
foo(arg: "\\uD83D")
}
'''

when:
Document document = Parser.parse(input)

then:
// TODO: Discuss whether to raise exception
dondonz marked this conversation as resolved.
Show resolved Hide resolved
false
}

// Nine zero digits inside the braces; the numeric value is 0, the open question is the digit count
@Ignore
def "invalid code point - too long" () {
given:
def input = '''"\\u{000000000}"'''

when:
String parsed = StringValueParsing.parseSingleQuotedString(input)

then:
// TODO: Discuss whether to raise exception. How do we want to treat leading zeroes?
false
}

/*
From the RFC
**Byte order mark**

UnicodeBOM :: "Byte Order Mark (U+FEFF)"

The *Byte Order Mark* is a special Unicode code point which may appear at the
beginning of a file which programs may use to determine the fact that the text
stream is Unicode, and what specific encoding has been used.

As files are often concatenated, a *Byte Order Mark* may appear anywhere within
a GraphQL document and is {Ignored}.
*/
@Ignore
// TODO: BOM was previously implemented. Do we want to change the prior implementation?
def "byte order mark to be ignored" () {
// The Byte Order Mark indicates a Unicode stream, and whether the stream is big-endian or little-endian
given:
def input = '''"hello \\uFEFF\\u4F60\\u597D"'''

when:
String parsed = StringValueParsing.parseSingleQuotedString(input)

then:
parsed == '''hello 你好'''
}

// TODO: How do we want to handle control characters?
@Ignore
def "escapes zero byte" () {
// TODO: This is a test case from the JS implementation. Do we want to implement this case?
given:
def input = '''"\\x00"'''

when:
String parsed = StringValueParsing.parseSingleQuotedString(input)

then:
parsed == '''\\u0000'''
}
}