Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support full unicode in parser #2404

Merged
merged 16 commits into from Jul 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
28 changes: 13 additions & 15 deletions src/main/antlr/GraphqlCommon.g4
Expand Up @@ -117,31 +117,29 @@ StringValue:

fragment BlockStringCharacter:
'\\"""'|
ExtendedSourceCharacter;
SourceCharacter;

// this is SourceCharacter without
// \u000a New line
// \u000d Carriage return
// \u0022 '"'
// \u005c '\'
fragment StringCharacter:
([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) | // this is SoureCharacter without '"' and '\'
([\u0000-\u0009] | [\u000b\u000c\u000e-\u0021] | [\u0023-\u005b] | [\u005d-\ud7ff] | [\ue000-\u{10ffff}]) |
'\\u' EscapedUnicode |
'\\' EscapedCharacter;

fragment EscapedCharacter : ["\\/bfnrt];
fragment EscapedUnicode : Hex Hex Hex Hex;
fragment EscapedUnicode : Hex Hex Hex Hex | '{' Hex+ '}';
fragment Hex : [0-9a-fA-F];

// this is the spec definition. Excludes surrogate leading and trailing values.
fragment SourceCharacter : [\u0000-\ud7ff] | [\ue000-\u{10ffff}];

// this is currently not covered by the spec because we allow all unicode chars
// u0009 = \t Horizontal tab
// u000a = \n line feed
// u000d = \r carriage return
// u0020 = space
fragment ExtendedSourceCharacter :[\u0009\u000A\u000D\u0020-\u{10FFFF}];
fragment ExtendedSourceCharacterWithoutLineFeed :[\u0009\u0020-\u{10FFFF}];
// CommentChar
fragment SourceCharacterWithoutLineFeed : [\u0000-\u0009] | [\u000b\u000c\u000e-\ud7ff] | [\ue000-\u{10ffff}];

// this is the spec definition
// fragment SourceCharacter :[\u0009\u000A\u000D\u0020-\uFFFF];


Comment: '#' ExtendedSourceCharacterWithoutLineFeed* -> channel(2);
Comment: '#' SourceCharacterWithoutLineFeed* -> channel(2);

LF: [\n] -> channel(3);
CR: [\r] -> channel(3);
Expand Down
4 changes: 4 additions & 0 deletions src/main/java/graphql/parser/AntlrHelper.java
Expand Up @@ -3,6 +3,7 @@
import graphql.Internal;
import graphql.language.SourceLocation;
import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.tree.TerminalNode;

import java.util.List;

Expand All @@ -28,6 +29,9 @@ public static SourceLocation createSourceLocation(MultiSourceReader multiSourceR
return AntlrHelper.createSourceLocation(multiSourceReader, token.getLine(), token.getCharPositionInLine());
}

/**
 * Creates the source location of a terminal node, using the line and the
 * character position in line reported by the node's underlying token.
 *
 * @param multiSourceReader the reader passed through to the line/column based overload
 * @param terminalNode      the terminal node whose token position is used
 *
 * @return the source location produced by the line/column based overload
 */
public static SourceLocation createSourceLocation(MultiSourceReader multiSourceReader, TerminalNode terminalNode) {
    Token symbol = terminalNode.getSymbol();
    int line = symbol.getLine();
    int column = symbol.getCharPositionInLine();
    return AntlrHelper.createSourceLocation(multiSourceReader, line, column);
}

/* grabs 3 lines before and after the syntax error */
public static String createPreview(MultiSourceReader multiSourceReader, int antrlLine) {
Expand Down
9 changes: 5 additions & 4 deletions src/main/java/graphql/parser/GraphqlAntlrToLanguage.java
Expand Up @@ -760,13 +760,14 @@ protected Value createValue(GraphqlParser.ValueContext ctx) {
return assertShouldNeverHappen();
}

static String quotedString(TerminalNode terminalNode) {
protected String quotedString(TerminalNode terminalNode) {
boolean multiLine = terminalNode.getText().startsWith("\"\"\"");
String strText = terminalNode.getText();
SourceLocation sourceLocation = AntlrHelper.createSourceLocation(multiSourceReader, terminalNode);
if (multiLine) {
return parseTripleQuotedString(strText);
} else {
return parseSingleQuotedString(strText);
return parseSingleQuotedString(strText, sourceLocation);
}
}

Expand Down Expand Up @@ -839,12 +840,12 @@ protected Description newDescription(GraphqlParser.DescriptionContext descriptio
}
String content = terminalNode.getText();
boolean multiLine = content.startsWith("\"\"\"");
SourceLocation sourceLocation = getSourceLocation(descriptionCtx);
if (multiLine) {
content = parseTripleQuotedString(content);
} else {
content = parseSingleQuotedString(content);
content = parseSingleQuotedString(content, sourceLocation);
}
SourceLocation sourceLocation = getSourceLocation(descriptionCtx);
return new Description(content, sourceLocation, multiLine);
}

Expand Down
20 changes: 13 additions & 7 deletions src/main/java/graphql/parser/StringValueParsing.java
Expand Up @@ -2,6 +2,7 @@

import graphql.Assert;
import graphql.Internal;
import graphql.language.SourceLocation;

import java.io.StringWriter;
import java.util.ArrayList;
Expand Down Expand Up @@ -30,7 +31,9 @@ public static String removeIndentation(String rawValue) {
String[] lines = rawValue.split("\\n");
Integer commonIndent = null;
for (int i = 0; i < lines.length; i++) {
if (i == 0) continue;
if (i == 0) {
continue;
}
String line = lines[i];
int length = line.length();
int indent = leadingWhitespace(line);
Expand All @@ -44,7 +47,9 @@ public static String removeIndentation(String rawValue) {
if (commonIndent != null) {
for (int i = 0; i < lineList.size(); i++) {
String line = lineList.get(i);
if (i == 0) continue;
if (i == 0) {
continue;
}
if (line.length() > commonIndent) {
line = line.substring(commonIndent);
lineList.set(i, line);
Expand Down Expand Up @@ -98,7 +103,7 @@ private static boolean containsOnlyWhiteSpace(String str) {
return leadingWhitespace(str) == str.length();
}

public static String parseSingleQuotedString(String string) {
public static String parseSingleQuotedString(String string, SourceLocation sourceLocation) {
StringWriter writer = new StringWriter(string.length() - 2);
int end = string.length() - 1;
for (int i = 1; i < end; i++) {
Expand Down Expand Up @@ -135,15 +140,16 @@ public static String parseSingleQuotedString(String string) {
writer.write('\t');
continue;
case 'u':
String hexStr = string.substring(i + 1, i + 5);
int codepoint = Integer.parseInt(hexStr, 16);
i += 4;
writer.write(codepoint);
i = UnicodeUtil.parseAndWriteUnicode(writer, string, i, sourceLocation);
continue;
default:
Assert.assertShouldNeverHappen();
}
}
return writer.toString();
}

/**
 * Parses a single quoted string without a known source location.
 * Delegates to the two argument overload, passing {@code null} as the location.
 *
 * @param string the quoted string text, including its surrounding quotes
 *
 * @return the parsed string value
 */
public static String parseSingleQuotedString(String string) {
    SourceLocation noSourceLocation = null;
    return parseSingleQuotedString(string, noSourceLocation);
}
}
114 changes: 114 additions & 0 deletions src/main/java/graphql/parser/UnicodeUtil.java
@@ -0,0 +1,114 @@
package graphql.parser;

import graphql.Internal;
import graphql.language.SourceLocation;

import java.io.IOException;
import java.io.StringWriter;

import static graphql.Assert.assertShouldNeverHappen;

/**
 * Contains Unicode helpers for parsing StringValue types in the grammar.
 * <p>
 * Escaped Unicode comes in two forms: unbraced, with exactly four hex digits, and braced,
 * with any number of hex digits surrounded by curly brackets.
 */
@Internal
public class UnicodeUtil {
    public static final int MAX_UNICODE_CODE_POINT = 0x10FFFF;
    public static final int LEADING_SURROGATE_LOWER_BOUND = 0xD800;
    public static final int LEADING_SURROGATE_UPPER_BOUND = 0xDBFF;
    public static final int TRAILING_SURROGATE_LOWER_BOUND = 0xDC00;
    public static final int TRAILING_SURROGATE_UPPER_BOUND = 0xDFFF;

    // Returned by parseHexCodePoint when the digits cannot be represented as an int.
    // It is negative so that it never passes the surrogate or valid code point checks.
    private static final int UNPARSEABLE_CODE_POINT = -1;

    /**
     * Parses the escaped Unicode character that starts at index {@code i} and writes the
     * resulting character(s) to the writer. A leading surrogate must be immediately followed
     * by a second escape holding a trailing surrogate; both are then consumed and written.
     *
     * @param writer         the writer that receives the decoded character(s)
     * @param string         the complete quoted string being parsed
     * @param i              the index of the 'u' of the escape (the backslash is at {@code i - 1})
     * @param sourceLocation the location reported in any syntax error, may be null
     *
     * @return the index of the last character consumed, either a hex digit (unbraced) or '}' (braced)
     *
     * @throws InvalidSyntaxException if the escape is badly formatted, is not a valid code point
     *                                (including hex values too large to fit in an int), or is an
     *                                unpaired surrogate
     */
    public static int parseAndWriteUnicode(StringWriter writer, String string, int i, SourceLocation sourceLocation) {
        // Unicode code points can either be:
        //  1. Unbraced: four hex characters in the form \\u597D, or
        //  2. Braced: any number of hex characters surrounded by braces in the form \\u{1F37A}

        // Extract the code point hex digits. Index i points to 'u'
        boolean braced = isBracedEscape(string, i);
        int startIndex = braced ? i + 2 : i + 1;
        int endIndexExclusive = getEndIndexExclusive(string, i, sourceLocation);
        // Index for parser to continue at, the last character of the escaped unicode character. Either } or hex digit
        int continueIndex = braced ? endIndexExclusive : endIndexExclusive - 1;

        int codePoint = parseHexCodePoint(string.substring(startIndex, endIndexExclusive));

        if (isTrailingSurrogateValue(codePoint)) {
            throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - trailing surrogate must be preceded with a leading surrogate -", null, string.substring(i - 1, continueIndex + 1), null);
        }

        if (isLeadingSurrogateValue(codePoint)) {
            if (!isEscapedUnicode(string, continueIndex + 1)) {
                throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, string.substring(i - 1, continueIndex + 1), null);
            }

            // Shift parser ahead to 'u' in second escaped Unicode character
            i = continueIndex + 2;
            boolean trailingBraced = isBracedEscape(string, i);
            int trailingStartIndex = trailingBraced ? i + 2 : i + 1;
            int trailingEndIndexExclusive = getEndIndexExclusive(string, i, sourceLocation);
            int trailingCodePoint = parseHexCodePoint(string.substring(trailingStartIndex, trailingEndIndexExclusive));
            continueIndex = trailingBraced ? trailingEndIndexExclusive : trailingEndIndexExclusive - 1;

            if (isTrailingSurrogateValue(trailingCodePoint)) {
                writeCodePoint(writer, codePoint);
                writeCodePoint(writer, trailingCodePoint);
                return continueIndex;
            }

            throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, string.substring(i - 1, continueIndex + 1), null);
        }

        if (isValidUnicodeCodePoint(codePoint)) {
            writeCodePoint(writer, codePoint);
            return continueIndex;
        }

        throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - not a valid code point -", null, string.substring(i - 1, continueIndex + 1), null);
    }

    /**
     * Parses hex digits into a code point value without letting a NumberFormatException escape.
     * The braced form allows any number of hex digits, so the value may not fit in an int.
     *
     * @return the parsed value, or UNPARSEABLE_CODE_POINT if the digits cannot be parsed as an int
     */
    private static int parseHexCodePoint(String hexStr) {
        try {
            return Integer.parseInt(hexStr, 16);
        } catch (NumberFormatException ignored) {
            // Deliberately mapped to a sentinel: the caller reports it as an invalid syntax error
            return UNPARSEABLE_CODE_POINT;
        }
    }

    /**
     * Finds where the hex digits of the escape at index {@code i} end.
     *
     * @return the exclusive end index of the hex digits; for the braced form this is the index of '}'
     */
    private static int getEndIndexExclusive(String string, int i, SourceLocation sourceLocation) {
        // Unbraced case, with exactly 4 hex digits
        if (string.length() > i + 5 && !isBracedEscape(string, i)) {
            return i + 5;
        }

        // Braced case, with any number of hex digits
        int endIndexExclusive = i + 2;
        do {
            // Stop before running off the end of the string when no closing brace is present
            if (endIndexExclusive + 1 >= string.length()) {
                throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - incorrectly formatted escape -", null, string.substring(i - 1, endIndexExclusive), null);
            }
        } while (string.charAt(++endIndexExclusive) != '}');

        return endIndexExclusive;
    }

    private static boolean isValidUnicodeCodePoint(int value) {
        // The lower bound rejects the unparseable sentinel and any other negative value
        return 0 <= value && value <= MAX_UNICODE_CODE_POINT;
    }

    /**
     * Checks whether a backslash followed by 'u' starts at the given index.
     */
    private static boolean isEscapedUnicode(String string, int index) {
        if (index + 1 >= string.length()) {
            return false;
        }
        return string.charAt(index) == '\\' && string.charAt(index + 1) == 'u';
    }

    private static boolean isLeadingSurrogateValue(int value) {
        return LEADING_SURROGATE_LOWER_BOUND <= value && value <= LEADING_SURROGATE_UPPER_BOUND;
    }

    private static boolean isTrailingSurrogateValue(int value) {
        return TRAILING_SURROGATE_LOWER_BOUND <= value && value <= TRAILING_SURROGATE_UPPER_BOUND;
    }

    private static void writeCodePoint(StringWriter writer, int codepoint) {
        char[] chars = Character.toChars(codepoint);
        // StringWriter overrides this overload without a checked IOException
        writer.write(chars, 0, chars.length);
    }

    /**
     * Checks whether the character after the 'u' at index {@code i} is an opening brace.
     */
    private static boolean isBracedEscape(String string, int i) {
        return string.charAt(i + 1) == '{';
    }
}
25 changes: 25 additions & 0 deletions src/test/groovy/graphql/GraphQLTest.groovy
Expand Up @@ -179,6 +179,31 @@ class GraphQLTest extends Specification {
errors[0].locations == [new SourceLocation(1, 8)]
}

// Executes a query whose string argument holds an escaped leading surrogate with no trailing surrogate,
// and expects exactly one InvalidSyntax error carrying the offending token and its source location.
def "query with invalid Unicode surrogate in argument - no trailing value"() {
given:
GraphQLSchema schema = newSchema().query(
newObject()
.name("RootQueryType")
.field(newFieldDefinition()
.name("field")
.type(GraphQLString)
.argument(newArgument()
.name("arg")
.type(GraphQLNonNull.nonNull(GraphQLString))))
.build()
).build()

when:
// Invalid Unicode character - leading surrogate value without trailing surrogate value
def errors = GraphQL.newGraphQL(schema).build().execute('{ hello(arg:"\\ud83c") }').errors

then:
errors.size() == 1
errors[0].errorType == ErrorType.InvalidSyntax
errors[0].message == "Invalid Syntax : Invalid unicode - leading surrogate must be followed by a trailing surrogate - offending token '\\ud83c' at line 1 column 13"
errors[0].locations == [new SourceLocation(1, 13)]
}

def "non null argument is missing"() {
given:
GraphQLSchema schema = newSchema().query(
Expand Down
68 changes: 68 additions & 0 deletions src/test/groovy/graphql/parser/ParserTest.groovy
Expand Up @@ -976,4 +976,72 @@ triple3 : """edge cases \\""" "" " \\"" \\" edge cases"""
!type.getIgnoredChars().getLeft().isEmpty()
!type.getIgnoredChars().getRight().isEmpty()
}

// Parses a string argument written as a braced Unicode escape and checks the decoded argument value.
def "allow braced escaped unicode"() {
given:
def input = '''
{
foo(arg: "\\u{1F37A}")
}
'''

when:
Document document = Parser.parse(input)
OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition)
def field = operationDefinition.getSelectionSet().getSelections()[0] as Field
def argValue = field.arguments[0].value as StringValue

then:
argValue.getValue() == "🍺" // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug
}

// Parses a string argument written as two consecutive escapes (leading then trailing surrogate)
// and checks that the pair decodes to a single character in the argument value.
def "allow surrogate pairs escaped unicode"() {
given:
def input = '''
{
foo(arg: "\\ud83c\\udf7a")
}
'''

when:
Document document = Parser.parse(input)
OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition)
def field = operationDefinition.getSelectionSet().getSelections()[0] as Field
def argValue = field.arguments[0].value as StringValue

then:
argValue.getValue() == "🍺" // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug
}

// An escaped leading surrogate that is not followed by a trailing surrogate must make the parser
// throw InvalidSyntaxException with the expected message.
def "invalid surrogate pair - no trailing value"() {
given:
def input = '''
{
foo(arg: "\\ud83c")
}
'''

when:
Parser.parse(input)

then:
InvalidSyntaxException e = thrown(InvalidSyntaxException)
e.message == "Invalid Syntax : Invalid unicode - leading surrogate must be followed by a trailing surrogate - offending token '\\ud83c' at line 3 column 24"
}

// An escaped trailing surrogate that is not preceded by a leading surrogate must make the parser
// throw InvalidSyntaxException with the expected message.
def "invalid surrogate pair - no leading value"() {
given:
def input = '''
{
foo(arg: "\\uDC00")
}
'''

when:
Parser.parse(input)

then:
InvalidSyntaxException e = thrown(InvalidSyntaxException)
e.message == "Invalid Syntax : Invalid unicode - trailing surrogate must be preceded with a leading surrogate - offending token '\\uDC00' at line 3 column 24"
}
}