graphql-java · andimarek · Jul 14, 2021 · Jun 28, 2021 · Jun 29, 2021 · Jun 29, 2021
diff --git a/src/main/antlr/GraphqlCommon.g4 b/src/main/antlr/GraphqlCommon.g4
@@ -120,12 +120,12 @@ fragment BlockStringCharacter:
 ExtendedSourceCharacter;
 
 fragment StringCharacter:
-([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) |  // this is SoureCharacter without '"' and '\'
+([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) |  // this is SourceCharacter without '"' and '\'
 '\\u' EscapedUnicode  |
 '\\' EscapedCharacter;
 
 fragment EscapedCharacter :  ["\\/bfnrt];
-fragment EscapedUnicode : Hex Hex Hex Hex;
+fragment EscapedUnicode : Hex Hex Hex Hex | '{' Hex+ '}';
 fragment Hex : [0-9a-fA-F];
 
 

diff --git a/src/main/java/graphql/parser/StringValueParsing.java b/src/main/java/graphql/parser/StringValueParsing.java
@@ -30,7 +30,9 @@ public static String removeIndentation(String rawValue) {
         String[] lines = rawValue.split("\\n");
         Integer commonIndent = null;
         for (int i = 0; i < lines.length; i++) {
-            if (i == 0) continue;
+            if (i == 0) {
+                continue;
+            }
             String line = lines[i];
             int length = line.length();
             int indent = leadingWhitespace(line);
@@ -44,7 +46,9 @@ public static String removeIndentation(String rawValue) {
         if (commonIndent != null) {
             for (int i = 0; i < lineList.size(); i++) {
                 String line = lineList.get(i);
-                if (i == 0) continue;
+                if (i == 0) {
+                    continue;
+                }
                 if (line.length() > commonIndent) {
                     line = line.substring(commonIndent);
                     lineList.set(i, line);
@@ -135,10 +139,7 @@ public static String parseSingleQuotedString(String string) {
                     writer.write('\t');
                     continue;
                 case 'u':
-                    String hexStr = string.substring(i + 1, i + 5);
-                    int codepoint = Integer.parseInt(hexStr, 16);
-                    i += 4;
-                    writer.write(codepoint);
+                    i = UnicodeUtil.parseAndWriteUnicode(writer, string, i);
                     continue;
                 default:
                     Assert.assertShouldNeverHappen();

diff --git a/src/main/java/graphql/parser/UnicodeUtil.java b/src/main/java/graphql/parser/UnicodeUtil.java
@@ -0,0 +1,164 @@
+package graphql.parser;
+
+import graphql.Internal;
+
+import java.io.IOException;
+import java.io.StringWriter;
+
+import static graphql.Assert.assertShouldNeverHappen;
+
+/**
+ * Contains Unicode helpers for parsing StringValue types in the grammar.
+ * <p>
+ * Malformed or out-of-range escapes are reported as {@link InvalidSyntaxException}.
+ */
+@Internal
+public class UnicodeUtil {
+    public static final int MAX_UNICODE_CODE_POINT = 0x10FFFF;
+    public static final int LEADING_SURROGATE_LOWER_BOUND = 0xD800;
+    public static final int LEADING_SURROGATE_UPPER_BOUND = 0xDBFF;
+    public static final int TRAILING_SURROGATE_LOWER_BOUND = 0xDC00;
+    public static final int TRAILING_SURROGATE_UPPER_BOUND = 0xDFFF;
+
+    // Marker returned by parseHex when the digits are not a value in [0, MAX_UNICODE_CODE_POINT]
+    private static final int INVALID_CODE_POINT = -1;
+
+    /**
+     * Decodes the escaped Unicode character whose 'u' is at index {@code i} and writes it to {@code writer}.
+     * A leading surrogate escape must be directly followed by a trailing surrogate escape; both are then
+     * consumed and written together.
+     *
+     * @param writer the writer receiving the decoded character(s)
+     * @param string the raw string value being parsed
+     * @param i      index of the 'u' of the escape; the offending-token text starts one character earlier
+     * @return the index of the last character consumed (a hex digit or '}')
+     * @throws InvalidSyntaxException if the escape is malformed, is an unpaired surrogate, or is not a valid code point
+     */
+    public static int parseAndWriteUnicode(StringWriter writer, String string, int i) {
+        // Unicode code points can either be:
+        //  1. Unbraced: four hex characters in the form \\u597D, or
+        //  2. Braced: any number of hex characters surrounded by braces in the form \\u{1F37A}
+
+        // Extract the code point hex digits. Index i points to 'u'
+        boolean braced = isBracedEscape(string, i);
+        int startIndex = braced ? i + 2 : i + 1;
+        int endIndexExclusive = getEndIndexExclusive(string, i);
+        // Index for parser to continue at, the last character of the escaped unicode character. Either } or hex digit
+        int continueIndex = braced ? endIndexExclusive : endIndexExclusive - 1;
+
+        // Parsed without Integer.parseInt so that overlong braced escapes cannot raise NumberFormatException
+        int codePoint = parseHex(string, startIndex, endIndexExclusive);
+
+        if (isTrailingSurrogateValue(codePoint)) {
+            throw new InvalidSyntaxException(null, "Invalid unicode - trailing surrogate must be preceded with a leading surrogate -", null, offendingToken(string, i - 1, continueIndex + 1), null);
+        } else if (isLeadingSurrogateValue(codePoint)) {
+            if (!isEscapedUnicode(string, continueIndex + 1)) {
+                throw new InvalidSyntaxException(null, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, offendingToken(string, i - 1, continueIndex + 1), null);
+            }
+
+            // Shift parser ahead to 'u' in second escaped Unicode character
+            int trailingIndex = continueIndex + 2;
+            boolean trailingBraced = isBracedEscape(string, trailingIndex);
+            int trailingStartIndex = trailingBraced ? trailingIndex + 2 : trailingIndex + 1;
+            int trailingEndIndexExclusive = getEndIndexExclusive(string, trailingIndex);
+            int trailingCodePoint = parseHex(string, trailingStartIndex, trailingEndIndexExclusive);
+            int trailingContinueIndex = trailingBraced ? trailingEndIndexExclusive : trailingEndIndexExclusive - 1;
+
+            if (isTrailingSurrogateValue(trailingCodePoint)) {
+                writeCodePoint(writer, codePoint);
+                writeCodePoint(writer, trailingCodePoint);
+                return trailingContinueIndex;
+            }
+
+            throw new InvalidSyntaxException(null, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, offendingToken(string, trailingIndex - 1, trailingContinueIndex + 1), null);
+        } else if (isValidUnicodeCodePoint(codePoint)) {
+            writeCodePoint(writer, codePoint);
+            return continueIndex;
+        }
+
+        throw new InvalidSyntaxException(null, "Invalid unicode - not a valid code point -", null, offendingToken(string, i - 1, continueIndex + 1), null);
+    }
+
+    // Returns the exclusive end index of the hex digits of the escape whose 'u' is at index i
+    private static int getEndIndexExclusive(String string, int i) {
+        if (!isBracedEscape(string, i)) {
+            // Unbraced case, with exactly 4 hex digits and at least one character after them
+            if (string.length() > i + 5) {
+                return i + 5;
+            }
+            // Too short: fail here instead of scanning for '}', which would accept input such as \\u12}
+            throw new InvalidSyntaxException(null, "Invalid unicode - incorrectly formatted escape -", null, offendingToken(string, i - 1, Math.max(i + 2, string.length() - 1)), null);
+        }
+
+        // Braced case, with any number of hex digits
+        int endIndexExclusive = i + 2;
+        do {
+            if (endIndexExclusive + 1 >= string.length()) {
+                throw new InvalidSyntaxException(null, "Invalid unicode - incorrectly formatted escape -", null, offendingToken(string, i - 1, endIndexExclusive), null);
+            }
+        } while (string.charAt(++endIndexExclusive) != '}');
+
+        return endIndexExclusive;
+    }
+
+    // Parses the hex digits in [startIndex, endIndexExclusive) without overflowing. Returns INVALID_CODE_POINT
+    // when there are no digits, a character is not a hex digit, or the value exceeds MAX_UNICODE_CODE_POINT
+    private static int parseHex(String string, int startIndex, int endIndexExclusive) {
+        if (startIndex >= endIndexExclusive || endIndexExclusive > string.length()) {
+            return INVALID_CODE_POINT;
+        }
+        int value = 0;
+        for (int index = startIndex; index < endIndexExclusive; index++) {
+            int digit = Character.digit(string.charAt(index), 16);
+            if (digit < 0) {
+                return INVALID_CODE_POINT;
+            }
+            // value is at most MAX_UNICODE_CODE_POINT here, so the shift cannot overflow an int
+            value = (value << 4) | digit;
+            if (value > MAX_UNICODE_CODE_POINT) {
+                return INVALID_CODE_POINT;
+            }
+        }
+        return value;
+    }
+
+    // Extracts the offending token for an error message, clamping the range to the bounds of the string
+    private static String offendingToken(String string, int beginIndex, int endIndexExclusive) {
+        int begin = Math.min(Math.max(0, beginIndex), string.length());
+        int end = Math.min(Math.max(begin, endIndexExclusive), string.length());
+        return string.substring(begin, end);
+    }
+
+    private static boolean isValidUnicodeCodePoint(int value) {
+        return 0 <= value && value <= MAX_UNICODE_CODE_POINT;
+    }
+
+    private static boolean isEscapedUnicode(String string, int index) {
+        if (index + 1 >= string.length()) {
+            return false;
+        }
+        return string.charAt(index) == '\\' && string.charAt(index + 1) == 'u';
+    }
+
+    private static boolean isLeadingSurrogateValue(int value) {
+        return LEADING_SURROGATE_LOWER_BOUND <= value && value <= LEADING_SURROGATE_UPPER_BOUND;
+    }
+
+    private static boolean isTrailingSurrogateValue(int value) {
+        return TRAILING_SURROGATE_LOWER_BOUND <= value && value <= TRAILING_SURROGATE_UPPER_BOUND;
+    }
+
+    private static void writeCodePoint(StringWriter writer, int codepoint) {
+        char[] chars = Character.toChars(codepoint);
+        try {
+            writer.write(chars);
+        } catch (IOException e) {
+            assertShouldNeverHappen();
+        }
+    }
+
+    // True when the character after the 'u' at index i is '{'; false when there is no such character
+    private static boolean isBracedEscape(String string, int i) {
+        return i + 1 < string.length() && string.charAt(i + 1) == '{';
+    }
+}
diff --git a/src/test/groovy/graphql/parser/StringValueParsingTest.groovy b/src/test/groovy/graphql/parser/StringValueParsingTest.groovy
@@ -40,8 +40,7 @@ class StringValueParsingTest extends Specification {
         parsed == '''"'''
     }
 
-    def "parsing emoji should work"() {
-        // needs surrogate pairs for this emoji
+    def "parsing beer stein as surrogate pair should work"() {
         given:
         def input = '''"\\ud83c\\udf7a"'''
 
@@ -52,18 +51,17 @@ class StringValueParsingTest extends Specification {
         parsed == '''🍺''' // contains the beer icon 	U+1F37A  : http://www.charbase.com/1f37a-unicode-beer-mug
     }
 
-    def "parsing simple unicode should work"() {
+    def "parsing simple unicode should work - Basic Multilingual Plane (BMP)"() {
         given:
-        def input = '''"\\u56fe"'''
+        def input = '''"\\u5564\\u9152"'''
 
         when:
         String parsed = StringValueParsing.parseSingleQuotedString(input)
 
         then:
-        parsed == '''图'''
+        parsed == '''啤酒'''
     }
 
-
     def "parsing triple quoted string should work"() {
         given:
         def input = '''"""triple quoted"""'''