graphql-java · andimarek · Jul 14, 2021 · Jun 28, 2021 · Jun 29, 2021 · Jun 29, 2021
diff --git a/src/main/antlr/GraphqlCommon.g4 b/src/main/antlr/GraphqlCommon.g4
@@ -120,12 +120,12 @@ fragment BlockStringCharacter:
 ExtendedSourceCharacter;
 
 fragment StringCharacter:
-([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) |  // this is SoureCharacter without '"' and '\'
+([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) |  // this is SourceCharacter without '"' and '\'
 '\\u' EscapedUnicode  |
 '\\' EscapedCharacter;
 
 fragment EscapedCharacter :  ["\\/bfnrt];
-fragment EscapedUnicode : Hex Hex Hex Hex;
+fragment EscapedUnicode : Hex Hex Hex Hex | '{' Hex+ '}';
 fragment Hex : [0-9a-fA-F];
 
 

diff --git a/src/main/java/graphql/parser/StringValueParsing.java b/src/main/java/graphql/parser/StringValueParsing.java
@@ -30,7 +30,9 @@ public static String removeIndentation(String rawValue) {
         String[] lines = rawValue.split("\\n");
         Integer commonIndent = null;
         for (int i = 0; i < lines.length; i++) {
-            if (i == 0) continue;
+            if (i == 0) {
+                continue;
+            }
             String line = lines[i];
             int length = line.length();
             int indent = leadingWhitespace(line);
@@ -44,7 +46,9 @@ public static String removeIndentation(String rawValue) {
         if (commonIndent != null) {
             for (int i = 0; i < lineList.size(); i++) {
                 String line = lineList.get(i);
-                if (i == 0) continue;
+                if (i == 0) {
+                    continue;
+                }
                 if (line.length() > commonIndent) {
                     line = line.substring(commonIndent);
                     lineList.set(i, line);
@@ -135,10 +139,7 @@ public static String parseSingleQuotedString(String string) {
                     writer.write('\t');
                     continue;
                 case 'u':
-                    String hexStr = string.substring(i + 1, i + 5);
-                    int codepoint = Integer.parseInt(hexStr, 16);
-                    i += 4;
-                    writer.write(codepoint);
+                    i = UnicodeUtil.parseAndWriteUnicode(writer, string, i);
                     continue;
                 default:
                     Assert.assertShouldNeverHappen();

diff --git a/src/main/java/graphql/parser/UnicodeUtil.java b/src/main/java/graphql/parser/UnicodeUtil.java
@@ -0,0 +1,112 @@
+package graphql.parser;
+
+import graphql.Assert;
+import graphql.Internal;
+
+import java.io.IOException;
+import java.io.StringWriter;
+
+/**
+ * Contains Unicode helpers for parsing StringValue types in the grammar
+ */
+@Internal
+public class UnicodeUtil {
+    /**
+     * The largest code point defined by Unicode.
+     */
+    public static final int MAX_UNICODE_CODE_POINT = 0x10FFFF;
+
+    private static final int FIXED_WIDTH_HEX_DIGITS = 4;
+
+    /**
+     * Decodes one unicode escape sequence and writes the decoded character(s) to the writer.
+     * <p>
+     * Two forms are supported:
+     * <ul>
+     * <li>exactly four hex digits, e.g. \\u597D, which covers the Basic Multilingual Plane (BMP)</li>
+     * <li>one or more hex digits in braces, e.g. \\u{1F37A}, which covers every Unicode scalar value</li>
+     * </ul>
+     * A surrogate written in the fixed-width form is written as is; pairing of surrogates is not validated.
+     *
+     * @param writer receives the decoded character(s)
+     * @param string the text that contains the escape sequence
+     * @param i      the index of the 'u' that follows the backslash
+     *
+     * @return the index of the last character of the escape sequence
+     *
+     * @throws RuntimeException if the escape is truncated, contains a non hex character, or (braced form only)
+     *                          does not denote a Unicode scalar value
+     */
+    public static int parseAndWriteUnicode(StringWriter writer, String string, int i) {
+        if (isNotBracedEscape(string, i)) {
+            return parseAndWriteFixedWidth(writer, string, i);
+        }
+        return parseAndWriteBraced(writer, string, i);
+    }
+
+    // Four hex digits only, e.g. \\u597D, for code points in the Basic Multilingual Plane (BMP)
+    private static int parseAndWriteFixedWidth(StringWriter writer, String string, int i) {
+        int startIx = i + 1;
+        int endIndexExclusive = startIx + FIXED_WIDTH_HEX_DIGITS;
+        if (endIndexExclusive > string.length()) {
+            throw new RuntimeException("invalid unicode encoding");
+        }
+        // four hex digits are at most 0xFFFF, so exactly one char is written
+        writer.write(parseHex(string, startIx, endIndexExclusive));
+        return i + FIXED_WIDTH_HEX_DIGITS;
+    }
+
+    // Any number of hex digits in braces, e.g. \\u{1F37A}, which allows code points outside the BMP
+    private static int parseAndWriteBraced(StringWriter writer, String string, int i) {
+        int startIx = i + 2;
+        int closingBraceIx = string.indexOf('}', startIx);
+        if (closingBraceIx < 0) {
+            throw new RuntimeException("invalid unicode encoding");
+        }
+        int codePoint = parseHex(string, startIx, closingBraceIx);
+        if (!isValidUnicodeCodePoint(codePoint)) {
+            throw new RuntimeException("invalid unicode code point");
+        }
+        try {
+            writer.write(Character.toChars(codePoint));
+        } catch (IOException e) {
+            // Writer.write(char[]) declares IOException, but a StringWriter performs no I/O
+            return Assert.assertShouldNeverHappen();
+        }
+        return closingBraceIx;
+    }
+
+    // Parses the hex digits in [startIx, endIndexExclusive). Digits are accumulated by hand so that a sign
+    // character is rejected and an arbitrarily long digit run (e.g. leading zeroes) cannot overflow an int.
+    private static int parseHex(String string, int startIx, int endIndexExclusive) {
+        if (startIx >= endIndexExclusive) {
+            throw new RuntimeException("invalid unicode encoding");
+        }
+        int value = 0;
+        for (int ix = startIx; ix < endIndexExclusive; ix++) {
+            int digit = Character.digit(string.charAt(ix), 16);
+            if (digit < 0) {
+                throw new RuntimeException("invalid unicode encoding");
+            }
+            value = (value << 4) | digit;
+            if (value > MAX_UNICODE_CODE_POINT) {
+                throw new RuntimeException("invalid unicode code point");
+            }
+        }
+        return value;
+    }
+
+    private static boolean isNotBracedEscape(String string, int i) {
+        if (i + 1 >= string.length()) {
+            throw new RuntimeException("invalid unicode encoding");
+        }
+        return string.charAt(i + 1) != '{';
+    }
+
+    private static boolean isValidUnicodeCodePoint(int value) {
+        // Unicode scalar values are 0x0000-0xD7FF and 0xE000-0x10FFFF, i.e. every code point except surrogates
+        boolean inRange = value >= 0 && value <= MAX_UNICODE_CODE_POINT;
+        boolean surrogate = value >= Character.MIN_SURROGATE && value <= Character.MAX_SURROGATE;
+        return inRange && !surrogate;
+    }
+}
diff --git a/src/test/groovy/graphql/parser/StringValueParsingTest.groovy b/src/test/groovy/graphql/parser/StringValueParsingTest.groovy
@@ -40,8 +40,7 @@ class StringValueParsingTest extends Specification {
         parsed == '''"'''
     }
 
-    def "parsing emoji should work"() {
-        // needs surrogate pairs for this emoji
+    def "parsing beer stein as surrogate pair should work"() {
         given:
         def input = '''"\\ud83c\\udf7a"'''
 
@@ -52,18 +51,17 @@ class StringValueParsingTest extends Specification {
         parsed == '''🍺''' // contains the beer icon 	U+1F37A  : http://www.charbase.com/1f37a-unicode-beer-mug
     }
 
-    def "parsing simple unicode should work"() {
+    def "parsing simple unicode should work - Basic Multilingual Plane (BMP)"() {
         given:
-        def input = '''"\\u56fe"'''
+        def input = '''"\\u5564\\u9152"'''
 
         when:
         String parsed = StringValueParsing.parseSingleQuotedString(input)
 
         then:
-        parsed == '''图'''
+        parsed == '''啤酒'''
     }
 
-
     def "parsing triple quoted string should work"() {
         given:
         def input = '''"""triple quoted"""'''

diff --git a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy b/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy
@@ -0,0 +1,199 @@
+package graphql.parser
+
+import graphql.language.Document
+import graphql.language.Field
+import graphql.language.OperationDefinition
+import graphql.language.StringValue
+import graphql.schema.validation.InvalidSchemaException
+import spock.lang.Ignore
+import spock.lang.Specification
+
+class UnicodeUtilParserTest extends Specification {
+    /*
+        Implements RFC to support full Unicode
+        Original RFC https://github.com/graphql/graphql-spec/issues/687
+        RFC spec text https://github.com/graphql/graphql-spec/pull/849
+        RFC JS implementation https://github.com/graphql/graphql-js/pull/3117
+
+        TL;DR
+        Previously, valid SourceCharacters included Unicode scalar values up to and including U+FFFF - the Basic Multilingual Plane (BMP)
+        Now this is changing to incorporate all Unicode scalar values
+        Assert {value} is within the *Unicode scalar value* range (>= 0x0000 and <= 0xD7FF or >= 0xE000 and <= 0x10FFFF).
+        Practically this means you can have your beer emoji (U+1F37A) in queries as \\u{1F37A}
+    */
+
+    // With this RFC, code points outside the Basic Multilingual Plane can be parsed. For example, emojis
+    // Previously emojis could only be parsed with surrogate pairs. Now they can be parsed with the code point directly
+    def "parsing beer stein as escaped unicode"() {
+        given:
+        def input = '''"\\u{1F37A} hello"'''
+
+        when:
+        String parsed = StringValueParsing.parseSingleQuotedString(input)
+
+        then:
+        parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug
+    }
+
+    def "parsing beer mug non escaped"() {
+        given:
+        def input = '''"🍺 hello"'''
+
+        when:
+        String parsed = StringValueParsing.parseSingleQuotedString(input)
+
+        then:
+        parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug
+    }
+
+    def "allow braced escaped unicode"() {
+        def input = '''
+              {
+              foo(arg: "\\u{1F37A}")
+               }
+        '''
+
+        when:
+        Document document = Parser.parse(input)
+        OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition)
+        def field = operationDefinition.getSelectionSet().getSelections()[0] as Field
+        def argValue = field.arguments[0].value as StringValue
+
+        then:
+        argValue.getValue() == "🍺"
+    }
+
+    /*
+        From the RFC:
+        For legacy reasons, a *supplementary character* may be escaped by two
+        fixed-width unicode escape sequences forming a *surrogate pair*. For example
+        the input `"\\uD83D\\uDCA9"` is a valid {StringValue} which represents the same
+        Unicode text as `"\\u{1F4A9}"`. While this legacy form is allowed, it should be
+        avoided as a variable-width unicode escape sequence is a clearer way to encode
+        such code points.
+    */
+    def "allow surrogate pairs escaped unicode"() {
+        def input = '''
+              {
+              foo(arg: "\\ud83c\\udf7a")
+               }
+        '''
+
+        when:
+        Document document = Parser.parse(input)
+        OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition)
+        def field = operationDefinition.getSelectionSet().getSelections()[0] as Field
+        def argValue = field.arguments[0].value as StringValue
+
+        then:
+        argValue.getValue() == "🍺"
+    }
+
+    /*
+        From the RFC:
+        * If {leadingValue} is >= 0xD800 and <= 0xDBFF (a *Leading Surrogate*):
+        * Assert {trailingValue} is >= 0xDC00 and <= 0xDFFF (a *Trailing Surrogate*).
+        * Return ({leadingValue} - 0xD800) × 0x400 + ({trailingValue} - 0xDC00) + 0x10000.
+     */
+    @Ignore
+    def "invalid surrogate pair"() {
+        def input = '''
+              {
+              foo(arg: "\\uD83D\\uDBFF")
+               }
+        '''
+
+        when:
+        Document document = Parser.parse(input)
+
+        then:
+        // TODO: Raise exception
+        false
+    }
+
+    def "invalid unicode code point"() {
+        def input = '''
+              {
+              foo(arg: "\\u{fffffff}")
+               }
+        '''
+
+        when:
+        Document document = Parser.parse(input)
+
+        then:
+        Exception e = thrown(Exception)
+        e.message == "invalid unicode code point"
+    }
+
+    @Ignore
+    def "invalid unpaired surrogate" () {
+        def input = '''
+              {
+              foo(arg: "\\uD83D")
+               }
+        '''
+
+        when:
+        Document document = Parser.parse(input)
+
+        then:
+        // TODO: Discuss whether to raise exception
+        false
+    }
+
+    @Ignore
+    def "invalid code point - too long" () {
+        given:
+        def input = '''"\\u{000000000}"'''
+
+        when:
+        String parsed = StringValueParsing.parseSingleQuotedString(input)
+
+        then:
+        // TODO: Discuss whether to raise exception. How do we want to treat leading zeroes?
+        false
+    }
+
+    /*
+        From the RFC
+        **Byte order mark**
+
+        UnicodeBOM :: "Byte Order Mark (U+FEFF)"
+
+        The *Byte Order Mark* is a special Unicode code point which may appear at the
+        beginning of a file which programs may use to determine the fact that the text
+        stream is Unicode, and what specific encoding has been used.
+
+        As files are often concatenated, a *Byte Order Mark* may appear anywhere within
+        a GraphQL document and is {Ignored}.
+    */
+    @Ignore
+    // TODO: BOM was previously implemented. Do we want to change the prior implementation?
+    def "byte order mark to be ignored" () {
+        // The Byte Order Mark indicates a Unicode stream, and whether the stream is big-endian or little-endian
+        given:
+        def input = '''"hello \\uFEFF\\u4F60\\u597D"'''
+
+        when:
+        String parsed = StringValueParsing.parseSingleQuotedString(input)
+
+        then:
+        parsed == '''hello 你好'''
+    }
+
+    // TODO: How do we want to handle control characters?
+    @Ignore
+    def "escapes zero byte" () {
+        // TODO: This is a test case from the JS implementation. Do we want to implement this case?
+        given:
+        def input = '''"\\x00"'''
+
+        when:
+        String parsed = StringValueParsing.parseSingleQuotedString(input)
+
+        then:
+        parsed == '''\\u0000'''
+    }
+}
+