Merge pull request #2404 from dondonz/unicode-full-range

Support full unicode in parser
graphql-java · Jul 14, 2021 · 357c9bb · 357c9bb
2 parents 35e9929 + b60d28a
commit 357c9bb
Show file tree

Hide file tree

Showing 9 changed files with 504 additions and 32 deletions.
diff --git a/src/main/antlr/GraphqlCommon.g4 b/src/main/antlr/GraphqlCommon.g4
@@ -117,31 +117,29 @@ StringValue:
 
 fragment BlockStringCharacter:
 '\\"""'|
-ExtendedSourceCharacter;
+SourceCharacter;
 
+// this is SourceCharacter without
+// \u000a New line
+// \u000d Carriage return
+// \u0022 '"'
+// \u005c '\'
 fragment StringCharacter:
-([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) |  // this is SoureCharacter without '"' and '\'
+([\u0000-\u0009] | [\u000b\u000c\u000e-\u0021] | [\u0023-\u005b] | [\u005d-\ud7ff] | [\ue000-\u{10ffff}]) |
 '\\u' EscapedUnicode  |
 '\\' EscapedCharacter;
 
 fragment EscapedCharacter :  ["\\/bfnrt];
-fragment EscapedUnicode : Hex Hex Hex Hex;
+fragment EscapedUnicode : Hex Hex Hex Hex | '{' Hex+ '}';
 fragment Hex : [0-9a-fA-F];
 
+// this is the spec definition. Excludes surrogate leading and trailing values.
+fragment SourceCharacter : [\u0000-\ud7ff] | [\ue000-\u{10ffff}];
 
-// this is currently not covered by the spec because we allow all unicode chars
-// u0009 = \t Horizontal tab
-// u000a = \n line feed
-// u000d = \r carriage return
-// u0020 = space
-fragment ExtendedSourceCharacter :[\u0009\u000A\u000D\u0020-\u{10FFFF}];
-fragment ExtendedSourceCharacterWithoutLineFeed :[\u0009\u0020-\u{10FFFF}];
+// CommentChar: SourceCharacter without \u000a (line feed) and \u000d (carriage return)
+fragment SourceCharacterWithoutLineFeed : [\u0000-\u0009] | [\u000b\u000c\u000e-\ud7ff] | [\ue000-\u{10ffff}];
 
-// this is the spec definition
-// fragment SourceCharacter :[\u0009\u000A\u000D\u0020-\uFFFF];
-
-
-Comment: '#' ExtendedSourceCharacterWithoutLineFeed* -> channel(2);
+Comment: '#' SourceCharacterWithoutLineFeed* -> channel(2);
 
 LF: [\n] -> channel(3);
 CR: [\r] -> channel(3);

diff --git a/src/main/java/graphql/parser/AntlrHelper.java b/src/main/java/graphql/parser/AntlrHelper.java
@@ -3,6 +3,7 @@
 import graphql.Internal;
 import graphql.language.SourceLocation;
 import org.antlr.v4.runtime.Token;
+import org.antlr.v4.runtime.tree.TerminalNode;
 
 import java.util.List;
 
@@ -28,6 +29,9 @@ public static SourceLocation createSourceLocation(MultiSourceReader multiSourceR
         return AntlrHelper.createSourceLocation(multiSourceReader, token.getLine(), token.getCharPositionInLine());
     }
 
+    public static SourceLocation createSourceLocation(MultiSourceReader multiSourceReader, TerminalNode terminalNode) {
+        return AntlrHelper.createSourceLocation(multiSourceReader, terminalNode.getSymbol().getLine(), terminalNode.getSymbol().getCharPositionInLine());
+    }
 
     /* grabs 3 lines before and after the syntax error */
     public static String createPreview(MultiSourceReader multiSourceReader, int antrlLine) {

diff --git a/src/main/java/graphql/parser/GraphqlAntlrToLanguage.java b/src/main/java/graphql/parser/GraphqlAntlrToLanguage.java
@@ -760,13 +760,14 @@ protected Value createValue(GraphqlParser.ValueContext ctx) {
         return assertShouldNeverHappen();
     }
 
-    static String quotedString(TerminalNode terminalNode) {
+    protected String quotedString(TerminalNode terminalNode) {
         boolean multiLine = terminalNode.getText().startsWith("\"\"\"");
         String strText = terminalNode.getText();
+        SourceLocation sourceLocation = AntlrHelper.createSourceLocation(multiSourceReader, terminalNode);
         if (multiLine) {
             return parseTripleQuotedString(strText);
         } else {
-            return parseSingleQuotedString(strText);
+            return parseSingleQuotedString(strText, sourceLocation);
         }
     }
 
@@ -839,12 +840,12 @@ protected Description newDescription(GraphqlParser.DescriptionContext descriptio
         }
         String content = terminalNode.getText();
         boolean multiLine = content.startsWith("\"\"\"");
+        SourceLocation sourceLocation = getSourceLocation(descriptionCtx);
         if (multiLine) {
             content = parseTripleQuotedString(content);
         } else {
-            content = parseSingleQuotedString(content);
+            content = parseSingleQuotedString(content, sourceLocation);
         }
-        SourceLocation sourceLocation = getSourceLocation(descriptionCtx);
         return new Description(content, sourceLocation, multiLine);
     }
 

diff --git a/src/main/java/graphql/parser/StringValueParsing.java b/src/main/java/graphql/parser/StringValueParsing.java
@@ -2,6 +2,7 @@
 
 import graphql.Assert;
 import graphql.Internal;
+import graphql.language.SourceLocation;
 
 import java.io.StringWriter;
 import java.util.ArrayList;
@@ -30,7 +31,9 @@ public static String removeIndentation(String rawValue) {
         String[] lines = rawValue.split("\\n");
         Integer commonIndent = null;
         for (int i = 0; i < lines.length; i++) {
-            if (i == 0) continue;
+            if (i == 0) {
+                continue;
+            }
             String line = lines[i];
             int length = line.length();
             int indent = leadingWhitespace(line);
@@ -44,7 +47,9 @@ public static String removeIndentation(String rawValue) {
         if (commonIndent != null) {
             for (int i = 0; i < lineList.size(); i++) {
                 String line = lineList.get(i);
-                if (i == 0) continue;
+                if (i == 0) {
+                    continue;
+                }
                 if (line.length() > commonIndent) {
                     line = line.substring(commonIndent);
                     lineList.set(i, line);
@@ -98,7 +103,7 @@ private static boolean containsOnlyWhiteSpace(String str) {
         return leadingWhitespace(str) == str.length();
     }
 
-    public static String parseSingleQuotedString(String string) {
+    public static String parseSingleQuotedString(String string, SourceLocation sourceLocation) {
         StringWriter writer = new StringWriter(string.length() - 2);
         int end = string.length() - 1;
         for (int i = 1; i < end; i++) {
@@ -135,15 +140,16 @@ public static String parseSingleQuotedString(String string) {
                     writer.write('\t');
                     continue;
                 case 'u':
-                    String hexStr = string.substring(i + 1, i + 5);
-                    int codepoint = Integer.parseInt(hexStr, 16);
-                    i += 4;
-                    writer.write(codepoint);
+                    i = UnicodeUtil.parseAndWriteUnicode(writer, string, i, sourceLocation);
                     continue;
                 default:
                     Assert.assertShouldNeverHappen();
             }
         }
         return writer.toString();
     }
+
+    public static String parseSingleQuotedString(String string) {
+        return parseSingleQuotedString(string, null);
+    }
 }
diff --git a/src/main/java/graphql/parser/UnicodeUtil.java b/src/main/java/graphql/parser/UnicodeUtil.java
@@ -0,0 +1,114 @@
+package graphql.parser;
+
+import graphql.Internal;
+import graphql.language.SourceLocation;
+
+import java.io.IOException;
+import java.io.StringWriter;
+
+import static graphql.Assert.assertShouldNeverHappen;
+
+/**
+ * Contains Unicode helpers for parsing StringValue types in the grammar
+ */
+@Internal
+public class UnicodeUtil {
+    public static int MAX_UNICODE_CODE_POINT = 0x10FFFF;
+    public static int LEADING_SURROGATE_LOWER_BOUND = 0xD800;
+    public static int LEADING_SURROGATE_UPPER_BOUND = 0xDBFF;
+    public static int TRAILING_SURROGATE_LOWER_BOUND = 0xDC00;
+    public static int TRAILING_SURROGATE_UPPER_BOUND = 0xDFFF;
+
+    public static int parseAndWriteUnicode(StringWriter writer, String string, int i, SourceLocation sourceLocation) {
+        // Unicode code points can either be:
+        //  1. Unbraced: four hex characters in the form \\u597D, or
+        //  2. Braced: any number of hex characters surrounded by braces in the form \\u{1F37A}
+
+        // Extract the code point hex digits. Index i points to 'u'
+        int startIndex = isBracedEscape(string, i) ? i + 2 : i + 1;
+        int endIndexExclusive = getEndIndexExclusive(string, i, sourceLocation);
+        // Index at which the caller resumes parsing: the last character of the escape sequence, either '}' (braced) or the final hex digit (unbraced)
+        int continueIndex = isBracedEscape(string, i) ? endIndexExclusive : endIndexExclusive - 1;
+
+        String hexStr = string.substring(startIndex, endIndexExclusive);
+        Integer codePoint = Integer.parseInt(hexStr, 16);
+
+        if (isTrailingSurrogateValue(codePoint)) {
+            throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - trailing surrogate must be preceded with a leading surrogate -", null, string.substring(i - 1, continueIndex + 1), null);
+        } else if (isLeadingSurrogateValue(codePoint)) {
+            if (!isEscapedUnicode(string, continueIndex + 1)) {
+                throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, string.substring(i - 1, continueIndex + 1), null);
+            }
+
+            // Shift parser ahead to 'u' in second escaped Unicode character
+            i = continueIndex + 2;
+            int trailingStartIndex = isBracedEscape(string, i) ? i + 2 : i + 1;
+            int trailingEndIndexExclusive = getEndIndexExclusive(string, i, sourceLocation);
+            String trailingHexStr = string.substring(trailingStartIndex, trailingEndIndexExclusive);
+            Integer trailingCodePoint = Integer.parseInt(trailingHexStr, 16);
+            continueIndex = isBracedEscape(string, i) ? trailingEndIndexExclusive : trailingEndIndexExclusive - 1;
+
+            if (isTrailingSurrogateValue(trailingCodePoint)) {
+                writeCodePoint(writer, codePoint);
+                writeCodePoint(writer, trailingCodePoint);
+                return continueIndex;
+            }
+
+            throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, string.substring(i - 1, continueIndex + 1), null);
+        } else if (isValidUnicodeCodePoint(codePoint)) {
+            writeCodePoint(writer, codePoint);
+            return continueIndex;
+        }
+
+        throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - not a valid code point -", null, string.substring(i - 1, continueIndex + 1), null);
+    }
+
+    private static int getEndIndexExclusive(String string, int i, SourceLocation sourceLocation) {
+        // Unbraced case, with exactly 4 hex digits
+        if (string.length() > i + 5 && !isBracedEscape(string, i)) {
+            return i + 5;
+        }
+
+        // Braced case, with any number of hex digits
+        int endIndexExclusive = i + 2;
+        do {
+            if (endIndexExclusive + 1 >= string.length()) {
+                throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - incorrectly formatted escape -", null, string.substring(i - 1, endIndexExclusive), null);
+            }
+        } while (string.charAt(++endIndexExclusive) != '}');
+
+        return endIndexExclusive;
+    }
+
+    private static boolean isValidUnicodeCodePoint(int value) {
+        return value <= MAX_UNICODE_CODE_POINT;
+    }
+
+    private static boolean isEscapedUnicode(String string, int index) {
+        if (index + 1 >= string.length()) {
+            return false;
+        }
+        return string.charAt(index) == '\\' && string.charAt(index + 1) == 'u';
+    }
+
+    private static boolean isLeadingSurrogateValue(int value) {
+        return LEADING_SURROGATE_LOWER_BOUND <= value && value <= LEADING_SURROGATE_UPPER_BOUND;
+    }
+
+    private static boolean isTrailingSurrogateValue(int value) {
+        return TRAILING_SURROGATE_LOWER_BOUND <= value && value <= TRAILING_SURROGATE_UPPER_BOUND;
+    }
+
+    private static void writeCodePoint(StringWriter writer, int codepoint) {
+        char[] chars = Character.toChars(codepoint);
+        try {
+            writer.write(chars);
+        } catch (IOException e) {
+            assertShouldNeverHappen();
+        }
+    }
+
+    private static boolean isBracedEscape(String string, int i) {
+        return string.charAt(i + 1) == '{';
+    }
+}
diff --git a/src/test/groovy/graphql/GraphQLTest.groovy b/src/test/groovy/graphql/GraphQLTest.groovy
@@ -182,6 +182,31 @@ class GraphQLTest extends Specification {
         errors[0].locations == [new SourceLocation(1, 8)]
     }
 
+    def "query with invalid Unicode surrogate in argument - no trailing value"() {
+        given:
+        GraphQLSchema schema = newSchema().query(
+                newObject()
+                        .name("RootQueryType")
+                        .field(newFieldDefinition()
+                                .name("field")
+                                .type(GraphQLString)
+                                .argument(newArgument()
+                                        .name("arg")
+                                        .type(GraphQLNonNull.nonNull(GraphQLString))))
+                        .build()
+        ).build()
+
+        when:
+        // Invalid Unicode character - leading surrogate value without trailing surrogate value
+        def errors = GraphQL.newGraphQL(schema).build().execute('{ hello(arg:"\\ud83c") }').errors
+
+        then:
+        errors.size() == 1
+        errors[0].errorType == ErrorType.InvalidSyntax
+        errors[0].message == "Invalid Syntax : Invalid unicode - leading surrogate must be followed by a trailing surrogate - offending token '\\ud83c' at line 1 column 13"
+        errors[0].locations == [new SourceLocation(1, 13)]
+    }
+
     def "non null argument is missing"() {
         given:
         GraphQLSchema schema = newSchema().query(

diff --git a/src/test/groovy/graphql/parser/ParserTest.groovy b/src/test/groovy/graphql/parser/ParserTest.groovy
@@ -983,4 +983,72 @@ triple3 : """edge cases \\""" "" " \\"" \\" edge cases"""
         !type.getIgnoredChars().getLeft().isEmpty()
         !type.getIgnoredChars().getRight().isEmpty()
     }
+
+    def "allow braced escaped unicode"() {
+        given:
+        def input = '''
+              {
+              foo(arg: "\\u{1F37A}")
+               }
+        '''
+
+        when:
+        Document document = Parser.parse(input)
+        OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition)
+        def field = operationDefinition.getSelectionSet().getSelections()[0] as Field
+        def argValue = field.arguments[0].value as StringValue
+
+        then:
+        argValue.getValue() == "🍺" // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug
+    }
+
+    def "allow surrogate pairs escaped unicode"() {
+        given:
+        def input = '''
+              {
+              foo(arg: "\\ud83c\\udf7a")
+               }
+        '''
+
+        when:
+        Document document = Parser.parse(input)
+        OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition)
+        def field = operationDefinition.getSelectionSet().getSelections()[0] as Field
+        def argValue = field.arguments[0].value as StringValue
+
+        then:
+        argValue.getValue() == "🍺" // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug
+    }
+
+    def "invalid surrogate pair - no trailing value"() {
+        given:
+        def input = '''
+              {
+              foo(arg: "\\ud83c")
+               }
+        '''
+
+        when:
+        Parser.parse(input)
+
+        then:
+        InvalidSyntaxException e = thrown(InvalidSyntaxException)
+        e.message == "Invalid Syntax : Invalid unicode - leading surrogate must be followed by a trailing surrogate - offending token '\\ud83c' at line 3 column 24"
+    }
+
+    def "invalid surrogate pair - no leading value"() {
+        given:
+        def input = '''
+              {
+              foo(arg: "\\uDC00")
+               }
+        '''
+
+        when:
+        Parser.parse(input)
+
+        then:
+        InvalidSyntaxException e = thrown(InvalidSyntaxException)
+        e.message == "Invalid Syntax : Invalid unicode - trailing surrogate must be preceded with a leading surrogate - offending token '\\uDC00' at line 3 column 24"
+    }
 }