Skip to content

Commit

Permalink
Merge pull request #2404 from dondonz/unicode-full-range
Browse files Browse the repository at this point in the history
Support full unicode in parser
  • Loading branch information
andimarek committed Jul 14, 2021
2 parents 35e9929 + b60d28a commit 357c9bb
Show file tree
Hide file tree
Showing 9 changed files with 504 additions and 32 deletions.
28 changes: 13 additions & 15 deletions src/main/antlr/GraphqlCommon.g4
Expand Up @@ -117,31 +117,29 @@ StringValue:

fragment BlockStringCharacter:
'\\"""'|
ExtendedSourceCharacter;
SourceCharacter;

// this is SourceCharacter without
// \u000a New line
// \u000d Carriage return
// \u0022 '"'
// \u005c '\'
fragment StringCharacter:
([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) | // this is SoureCharacter without '"' and '\'
([\u0000-\u0009] | [\u000b\u000c\u000e-\u0021] | [\u0023-\u005b] | [\u005d-\ud7ff] | [\ue000-\u{10ffff}]) |
'\\u' EscapedUnicode |
'\\' EscapedCharacter;

fragment EscapedCharacter : ["\\/bfnrt];
fragment EscapedUnicode : Hex Hex Hex Hex;
fragment EscapedUnicode : Hex Hex Hex Hex | '{' Hex+ '}';
fragment Hex : [0-9a-fA-F];
// this is the spec definition. Excludes surrogate leading and trailing values.
fragment SourceCharacter : [\u0000-\ud7ff] | [\ue000-\u{10ffff}];
// this is currently not covered by the spec because we allow all unicode chars
// u0009 = \t Horizontal tab
// u000a = \n line feed
// u000d = \r carriage return
// u0020 = space
fragment ExtendedSourceCharacter :[\u0009\u000A\u000D\u0020-\u{10FFFF}];
fragment ExtendedSourceCharacterWithoutLineFeed :[\u0009\u0020-\u{10FFFF}];
// CommentChar
fragment SourceCharacterWithoutLineFeed : [\u0000-\u0009] | [\u000b\u000c\u000e-\ud7ff] | [\ue000-\u{10ffff}];
// this is the spec definition
// fragment SourceCharacter :[\u0009\u000A\u000D\u0020-\uFFFF];
Comment: '#' ExtendedSourceCharacterWithoutLineFeed* -> channel(2);
Comment: '#' SourceCharacterWithoutLineFeed* -> channel(2);
LF: [\n] -> channel(3);
CR: [\r] -> channel(3);
Expand Down
4 changes: 4 additions & 0 deletions src/main/java/graphql/parser/AntlrHelper.java
Expand Up @@ -3,6 +3,7 @@
import graphql.Internal;
import graphql.language.SourceLocation;
import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.tree.TerminalNode;

import java.util.List;

Expand All @@ -28,6 +29,9 @@ public static SourceLocation createSourceLocation(MultiSourceReader multiSourceR
return AntlrHelper.createSourceLocation(multiSourceReader, token.getLine(), token.getCharPositionInLine());
}

/**
 * Builds the source location of the token held by the given terminal node.
 *
 * @param multiSourceReader the reader passed through to the line/column based overload
 * @param terminalNode      the parse tree leaf whose token supplies the line and column
 * @return the source location produced by the line/column based overload
 */
public static SourceLocation createSourceLocation(MultiSourceReader multiSourceReader, TerminalNode terminalNode) {
    Token symbol = terminalNode.getSymbol();
    int line = symbol.getLine();
    int column = symbol.getCharPositionInLine();
    return AntlrHelper.createSourceLocation(multiSourceReader, line, column);
}

/* grabs 3 lines before and after the syntax error */
public static String createPreview(MultiSourceReader multiSourceReader, int antrlLine) {
Expand Down
9 changes: 5 additions & 4 deletions src/main/java/graphql/parser/GraphqlAntlrToLanguage.java
Expand Up @@ -760,13 +760,14 @@ protected Value createValue(GraphqlParser.ValueContext ctx) {
return assertShouldNeverHappen();
}

static String quotedString(TerminalNode terminalNode) {
protected String quotedString(TerminalNode terminalNode) {
boolean multiLine = terminalNode.getText().startsWith("\"\"\"");
String strText = terminalNode.getText();
SourceLocation sourceLocation = AntlrHelper.createSourceLocation(multiSourceReader, terminalNode);
if (multiLine) {
return parseTripleQuotedString(strText);
} else {
return parseSingleQuotedString(strText);
return parseSingleQuotedString(strText, sourceLocation);
}
}

Expand Down Expand Up @@ -839,12 +840,12 @@ protected Description newDescription(GraphqlParser.DescriptionContext descriptio
}
String content = terminalNode.getText();
boolean multiLine = content.startsWith("\"\"\"");
SourceLocation sourceLocation = getSourceLocation(descriptionCtx);
if (multiLine) {
content = parseTripleQuotedString(content);
} else {
content = parseSingleQuotedString(content);
content = parseSingleQuotedString(content, sourceLocation);
}
SourceLocation sourceLocation = getSourceLocation(descriptionCtx);
return new Description(content, sourceLocation, multiLine);
}

Expand Down
20 changes: 13 additions & 7 deletions src/main/java/graphql/parser/StringValueParsing.java
Expand Up @@ -2,6 +2,7 @@

import graphql.Assert;
import graphql.Internal;
import graphql.language.SourceLocation;

import java.io.StringWriter;
import java.util.ArrayList;
Expand Down Expand Up @@ -30,7 +31,9 @@ public static String removeIndentation(String rawValue) {
String[] lines = rawValue.split("\\n");
Integer commonIndent = null;
for (int i = 0; i < lines.length; i++) {
if (i == 0) continue;
if (i == 0) {
continue;
}
String line = lines[i];
int length = line.length();
int indent = leadingWhitespace(line);
Expand All @@ -44,7 +47,9 @@ public static String removeIndentation(String rawValue) {
if (commonIndent != null) {
for (int i = 0; i < lineList.size(); i++) {
String line = lineList.get(i);
if (i == 0) continue;
if (i == 0) {
continue;
}
if (line.length() > commonIndent) {
line = line.substring(commonIndent);
lineList.set(i, line);
Expand Down Expand Up @@ -98,7 +103,7 @@ private static boolean containsOnlyWhiteSpace(String str) {
return leadingWhitespace(str) == str.length();
}

public static String parseSingleQuotedString(String string) {
public static String parseSingleQuotedString(String string, SourceLocation sourceLocation) {
StringWriter writer = new StringWriter(string.length() - 2);
int end = string.length() - 1;
for (int i = 1; i < end; i++) {
Expand Down Expand Up @@ -135,15 +140,16 @@ public static String parseSingleQuotedString(String string) {
writer.write('\t');
continue;
case 'u':
String hexStr = string.substring(i + 1, i + 5);
int codepoint = Integer.parseInt(hexStr, 16);
i += 4;
writer.write(codepoint);
i = UnicodeUtil.parseAndWriteUnicode(writer, string, i, sourceLocation);
continue;
default:
Assert.assertShouldNeverHappen();
}
}
return writer.toString();
}

/**
 * Convenience overload that parses a single quoted string without a source location.
 *
 * @param string the quoted string text, handed unchanged to the two argument overload
 * @return whatever the two argument overload returns when given a null source location
 */
public static String parseSingleQuotedString(String string) {
    // No location is available here, so the two argument overload receives null
    final SourceLocation noLocation = null;
    return parseSingleQuotedString(string, noLocation);
}
}
114 changes: 114 additions & 0 deletions src/main/java/graphql/parser/UnicodeUtil.java
@@ -0,0 +1,114 @@
package graphql.parser;

import graphql.Internal;
import graphql.language.SourceLocation;

import java.io.IOException;
import java.io.StringWriter;

import static graphql.Assert.assertShouldNeverHappen;

/**
 * Contains Unicode helpers for parsing StringValue types in the grammar
 */
@Internal
public class UnicodeUtil {
    // These bounds are constants, so they are final: nothing may reassign them and silently change validation
    public static final int MAX_UNICODE_CODE_POINT = 0x10FFFF;
    public static final int LEADING_SURROGATE_LOWER_BOUND = 0xD800;
    public static final int LEADING_SURROGATE_UPPER_BOUND = 0xDBFF;
    public static final int TRAILING_SURROGATE_LOWER_BOUND = 0xDC00;
    public static final int TRAILING_SURROGATE_UPPER_BOUND = 0xDFFF;

    // Returned by parseHex when the digits do not fit an int. It lies outside the surrogate
    // ranges and outside the valid code point range, so it always ends in a syntax error.
    private static final int UNPARSEABLE_CODE_POINT = -1;

    /**
     * Decodes the escaped Unicode sequence whose 'u' is at index {@code i} and writes the
     * resulting character(s) to {@code writer}. A leading surrogate must be immediately
     * followed by a second escape holding a trailing surrogate; both are then written.
     *
     * @param writer         receives the decoded character(s)
     * @param string         the text containing the escape
     * @param i              index of the 'u' of the escape (the backslash is at {@code i - 1})
     * @param sourceLocation location attached to any syntax error that is raised
     * @return the index of the last character belonging to the escape (a closing brace or the
     * final hex digit), which is where the caller continues from
     * @throws InvalidSyntaxException if the escape is malformed, is an unpaired surrogate, or
     *                                does not denote a code point in the Unicode range
     */
    public static int parseAndWriteUnicode(StringWriter writer, String string, int i, SourceLocation sourceLocation) {
        // Unicode code points can either be:
        //  1. Unbraced: four hex characters in the form \\u597D, or
        //  2. Braced: any number of hex characters surrounded by braces in the form \\u{1F37A}

        // Extract the code point hex digits. Index i points to 'u'
        boolean braced = isBracedEscape(string, i);
        int startIndex = braced ? i + 2 : i + 1;
        int endIndexExclusive = getEndIndexExclusive(string, i, sourceLocation);
        // Index for parser to continue at, the last character of the escaped unicode character. Either } or hex digit
        int continueIndex = braced ? endIndexExclusive : endIndexExclusive - 1;

        int codePoint = parseHex(string.substring(startIndex, endIndexExclusive));

        if (isTrailingSurrogateValue(codePoint)) {
            throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - trailing surrogate must be preceded with a leading surrogate -", null, string.substring(i - 1, continueIndex + 1), null);
        }

        if (isLeadingSurrogateValue(codePoint)) {
            if (!isEscapedUnicode(string, continueIndex + 1)) {
                throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, string.substring(i - 1, continueIndex + 1), null);
            }

            // Index of 'u' in the second escaped Unicode character
            int trailingIndex = continueIndex + 2;
            boolean trailingBraced = isBracedEscape(string, trailingIndex);
            int trailingStartIndex = trailingBraced ? trailingIndex + 2 : trailingIndex + 1;
            int trailingEndIndexExclusive = getEndIndexExclusive(string, trailingIndex, sourceLocation);
            int trailingCodePoint = parseHex(string.substring(trailingStartIndex, trailingEndIndexExclusive));
            continueIndex = trailingBraced ? trailingEndIndexExclusive : trailingEndIndexExclusive - 1;

            if (isTrailingSurrogateValue(trailingCodePoint)) {
                writeCodePoint(writer, codePoint);
                writeCodePoint(writer, trailingCodePoint);
                return continueIndex;
            }

            // The offending token reported here is the second escape, the one that failed to be a trailing surrogate
            throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, string.substring(trailingIndex - 1, continueIndex + 1), null);
        }

        if (isValidUnicodeCodePoint(codePoint)) {
            writeCodePoint(writer, codePoint);
            return continueIndex;
        }

        throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - not a valid code point -", null, string.substring(i - 1, continueIndex + 1), null);
    }

    /**
     * Parses hex digits into a code point value.
     *
     * @param hexStr the digits taken from the escape
     * @return the parsed value, or {@link #UNPARSEABLE_CODE_POINT} when the digits cannot be
     * represented as an int (for example a braced escape with too many digits)
     */
    private static int parseHex(String hexStr) {
        try {
            return Integer.parseInt(hexStr, 16);
        } catch (NumberFormatException ignored) {
            // Not swallowed: the sentinel makes the caller raise InvalidSyntaxException
            // instead of letting a NumberFormatException escape the parser.
            return UNPARSEABLE_CODE_POINT;
        }
    }

    /**
     * Finds where the hex digits of the escape at {@code i} end.
     *
     * @return for an unbraced escape the index just past the fourth hex digit, for a braced
     * escape the index of the closing brace
     * @throws InvalidSyntaxException if no closing brace is found before the end of the string
     */
    private static int getEndIndexExclusive(String string, int i, SourceLocation sourceLocation) {
        // Unbraced case, with exactly 4 hex digits
        if (string.length() > i + 5 && !isBracedEscape(string, i)) {
            return i + 5;
        }

        // Braced case, with any number of hex digits
        int endIndexExclusive = i + 2;
        do {
            if (endIndexExclusive + 1 >= string.length()) {
                throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - incorrectly formatted escape -", null, string.substring(i - 1, endIndexExclusive), null);
            }
        } while (string.charAt(++endIndexExclusive) != '}');

        return endIndexExclusive;
    }

    // True when value lies in 0..MAX_UNICODE_CODE_POINT; the lower bound keeps negative values away from Character.toChars
    private static boolean isValidUnicodeCodePoint(int value) {
        return 0 <= value && value <= MAX_UNICODE_CODE_POINT;
    }

    // True when a backslash followed by 'u' starts at index and both characters are inside the string
    private static boolean isEscapedUnicode(String string, int index) {
        if (index + 1 >= string.length()) {
            return false;
        }
        return string.charAt(index) == '\\' && string.charAt(index + 1) == 'u';
    }

    private static boolean isLeadingSurrogateValue(int value) {
        return LEADING_SURROGATE_LOWER_BOUND <= value && value <= LEADING_SURROGATE_UPPER_BOUND;
    }

    private static boolean isTrailingSurrogateValue(int value) {
        return TRAILING_SURROGATE_LOWER_BOUND <= value && value <= TRAILING_SURROGATE_UPPER_BOUND;
    }

    // Writes the UTF-16 form of the code point (one or two chars) to the writer
    private static void writeCodePoint(StringWriter writer, int codepoint) {
        char[] chars = Character.toChars(codepoint);
        try {
            writer.write(chars);
        } catch (IOException e) {
            // Writer.write(char[]) declares IOException, hence the catch
            assertShouldNeverHappen();
        }
    }

    // True when the character right after the 'u' at index i opens a braced escape
    private static boolean isBracedEscape(String string, int i) {
        return string.charAt(i + 1) == '{';
    }
}
25 changes: 25 additions & 0 deletions src/test/groovy/graphql/GraphQLTest.groovy
Expand Up @@ -182,6 +182,31 @@ class GraphQLTest extends Specification {
errors[0].locations == [new SourceLocation(1, 8)]
}

// Executes a query whose string argument holds an escaped leading surrogate with no trailing
// surrogate after it, and expects exactly one InvalidSyntax error carrying the pinned message
// and source location.
def "query with invalid Unicode surrogate in argument - no trailing value"() {
given:
GraphQLSchema schema = newSchema().query(
newObject()
.name("RootQueryType")
.field(newFieldDefinition()
.name("field")
.type(GraphQLString)
.argument(newArgument()
.name("arg")
.type(GraphQLNonNull.nonNull(GraphQLString))))
.build()
).build()

when:
// Invalid Unicode character - leading surrogate value without trailing surrogate value
// The doubled backslash is a Groovy escape for one backslash, so the query text contains the escape sequence itself
def errors = GraphQL.newGraphQL(schema).build().execute('{ hello(arg:"\\ud83c") }').errors

then:
errors.size() == 1
errors[0].errorType == ErrorType.InvalidSyntax
errors[0].message == "Invalid Syntax : Invalid unicode - leading surrogate must be followed by a trailing surrogate - offending token '\\ud83c' at line 1 column 13"
errors[0].locations == [new SourceLocation(1, 13)]
}

def "non null argument is missing"() {
given:
GraphQLSchema schema = newSchema().query(
Expand Down
68 changes: 68 additions & 0 deletions src/test/groovy/graphql/parser/ParserTest.groovy
Expand Up @@ -983,4 +983,72 @@ triple3 : """edge cases \\""" "" " \\"" \\" edge cases"""
!type.getIgnoredChars().getLeft().isEmpty()
!type.getIgnoredChars().getRight().isEmpty()
}

// Parses a string argument written as a braced Unicode escape for U+1F37A and checks that the
// resulting StringValue holds the decoded character.
def "allow braced escaped unicode"() {
given:
// The doubled backslash is a Groovy escape for one backslash, so the document contains the escape sequence itself
def input = '''
{
foo(arg: "\\u{1F37A}")
}
'''

when:
Document document = Parser.parse(input)
OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition)
def field = operationDefinition.getSelectionSet().getSelections()[0] as Field
def argValue = field.arguments[0].value as StringValue

then:
argValue.getValue() == "🍺" // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug
}

// Parses a string argument written as two consecutive four digit escapes (a leading surrogate
// followed by a trailing surrogate) and checks that together they decode to a single character.
def "allow surrogate pairs escaped unicode"() {
given:
// The doubled backslashes are Groovy escapes for one backslash each, so the document contains the escape sequences themselves
def input = '''
{
foo(arg: "\\ud83c\\udf7a")
}
'''

when:
Document document = Parser.parse(input)
OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition)
def field = operationDefinition.getSelectionSet().getSelections()[0] as Field
def argValue = field.arguments[0].value as StringValue

then:
argValue.getValue() == "🍺" // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug
}

// Parsing an escaped leading surrogate that is not followed by a second escape must fail with
// an InvalidSyntaxException whose message names the offending escape and its position.
def "invalid surrogate pair - no trailing value"() {
given:
// The doubled backslash is a Groovy escape for one backslash, so the document contains the escape sequence itself
def input = '''
{
foo(arg: "\\ud83c")
}
'''

when:
Parser.parse(input)

then:
InvalidSyntaxException e = thrown(InvalidSyntaxException)
e.message == "Invalid Syntax : Invalid unicode - leading surrogate must be followed by a trailing surrogate - offending token '\\ud83c' at line 3 column 24"
}

// Parsing an escaped trailing surrogate that has no leading surrogate before it must fail with
// an InvalidSyntaxException whose message names the offending escape and its position.
def "invalid surrogate pair - no leading value"() {
given:
// The doubled backslash is a Groovy escape for one backslash, so the document contains the escape sequence itself
def input = '''
{
foo(arg: "\\uDC00")
}
'''

when:
Parser.parse(input)

then:
InvalidSyntaxException e = thrown(InvalidSyntaxException)
e.message == "Invalid Syntax : Invalid unicode - trailing surrogate must be preceded with a leading surrogate - offending token '\\uDC00' at line 3 column 24"
}
}

0 comments on commit 357c9bb

Please sign in to comment.