From a0785c6ffadb847eef7f271c9a43ff9e4068e1b2 Mon Sep 17 00:00:00 2001 From: dondonz <13839920+dondonz@users.noreply.github.com> Date: Mon, 28 Jun 2021 11:14:44 +1000 Subject: [PATCH 01/15] Add tests to support full Unicode parser --- .../parser/StringValueParsingTest.groovy | 10 +- .../parser/UnicodeUtilParserTest.groovy | 191 ++++++++++++++++++ 2 files changed, 195 insertions(+), 6 deletions(-) create mode 100644 src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy diff --git a/src/test/groovy/graphql/parser/StringValueParsingTest.groovy b/src/test/groovy/graphql/parser/StringValueParsingTest.groovy index 8044d1cec7..5543dbc339 100644 --- a/src/test/groovy/graphql/parser/StringValueParsingTest.groovy +++ b/src/test/groovy/graphql/parser/StringValueParsingTest.groovy @@ -40,8 +40,7 @@ class StringValueParsingTest extends Specification { parsed == '''"''' } - def "parsing emoji should work"() { - // needs surrogate pairs for this emoji + def "parsing beer stein as surrogate pair should work"() { given: def input = '''"\\ud83c\\udf7a"''' @@ -52,18 +51,17 @@ class StringValueParsingTest extends Specification { parsed == '''🍺''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug } - def "parsing simple unicode should work"() { + def "parsing simple unicode should work - Basic Multilingual Plane (BMP)"() { given: - def input = '''"\\u56fe"''' + def input = '''"\\u5564\\u9152"''' when: String parsed = StringValueParsing.parseSingleQuotedString(input) then: - parsed == '''图''' + parsed == '''啤酒''' } - def "parsing triple quoted string should work"() { given: def input = '''"""triple quoted"""''' diff --git a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy b/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy new file mode 100644 index 0000000000..3a5cba4bcf --- /dev/null +++ b/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy @@ -0,0 +1,191 @@ +package graphql.parser + +import graphql.language.Document +import graphql.language.Field +import graphql.language.OperationDefinition +import graphql.language.StringValue +import spock.lang.Specification + +class UnicodeUtilParserTest extends Specification { + /* + Implements RFC to support full Unicode + Original RFC https://github.com/graphql/graphql-spec/issues/687 + RFC spec text https://github.com/graphql/graphql-spec/pull/849 + RFC JS implementation https://github.com/graphql/graphql-js/pull/3117 + + TL;DR + Previously, valid SourceCharacters included Unicode scalar values up to and including U+FFFF - the Basic Multilingual Plane (BMP) + Now this is changing to incorporate all Unicode scalar values + Assert {value} is a within the *Unicode scalar value* range (>= 0x0000 and <= 0xD7FF or >= 0xE000 and <= 0x10FFFF). + Practically this means you can have your beer emoji (U+1F37A) in queries as \\u{1F37A} + */ + + // With this RFC, code points outside the Basic Multilingual Plane can be parsed. For example, emojis + // Previously emojis could only be parsed with surrogate pairs. Now they can be parsed with the code point directly + def "parsing beer stein as escaped unicode"() { + given: + def input = '''"\\u{1F37A} hello"''' + + when: + String parsed = StringValueParsing.parseSingleQuotedString(input) + + then: + parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug + } + + def "parsing beer mug non escaped"() { + given: + def input = '''"🍺 hello"''' + + when: + String parsed = StringValueParsing.parseSingleQuotedString(input) + + then: + parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug + } + + def "allow braced escaped unicode"() { + def input = ''' + { + foo(arg: "\\u{1F37A}") + } + ''' + + when: + Document document = Parser.parse(input) + OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition) + def field = operationDefinition.getSelectionSet().getSelections()[0] as Field + def argValue = field.arguments[0].value as StringValue + + then: + argValue.getValue() == "🍺" + } + + /* + From the RFC: + For legacy reasons, a *supplementary character* may be escaped by two + fixed-width unicode escape sequences forming a *surrogate pair*. For example + the input `"\uD83D\uDCA9"` is a valid {StringValue} which represents the same + Unicode text as `"\u{1F4A9}"`. While this legacy form is allowed, it should be + avoided as a variable-width unicode escape sequence is a clearer way to encode + such code points. + */ + def "allow surrogate pairs escaped unicode"() { + def input = ''' + { + foo(arg: "\\ud83c\\udf7a") + } + ''' + + when: + Document document = Parser.parse(input) + OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition) + def field = operationDefinition.getSelectionSet().getSelections()[0] as Field + def argValue = field.arguments[0].value as StringValue + + then: + argValue.getValue() == "🍺" + } + + /* + From the RFC: + * If {leadingValue} is >= 0xD800 and <= 0xDBFF (a *Leading Surrogate*): + * Assert {trailingValue} is >= 0xDC00 and <= 0xDFFF (a *Trailing Surrogate*). + * Return ({leadingValue} - 0xD800) × 0x400 + ({trailingValue} - 0xDC00) + 0x10000. + */ + def "invalid surrogate pair"() { + def input = ''' + { + foo(arg: "\\uD83D\\uDBFF") + } + ''' + + when: + Document document = Parser.parse(input) + + then: + // TODO: Raise exception + false + } + + def "invalid unicode code point"() { + def input = ''' + { + foo(arg: "\\u{fffffff}") + } + ''' + + when: + Document document = Parser.parse(input) + + then: + // TODO: Raise exception + false + } + + def "invalid unpaired surrogate" () { + def input = ''' + { + foo(arg: "\\uD83D") + } + ''' + + when: + Document document = Parser.parse(input) + + then: + // TODO: Discuss whether to raise exception + false + } + + def "invalid code point - too long" () { + given: + def input = '''"\\u{000000000}"''' + + when: + String parsed = StringValueParsing.parseSingleQuotedString(input) + + then: + // TODO: Discuss whether to raise exception + false + } + + /* + From the RFC + **Byte order mark** + + UnicodeBOM :: "Byte Order Mark (U+FEFF)" + + The *Byte Order Mark* is a special Unicode code point which may appear at the + beginning of a file which programs may use to determine the fact that the text + stream is Unicode, and what specific encoding has been used. + + As files are often concatenated, a *Byte Order Mark* may appear anywhere within + a GraphQL document and is {Ignored}. + */ + def "byte order mark to be ignored" () { + // The Byte Order Mark indicates a Unicode stream, and whether the stream is high-endian or low-endian + given: + def input = '''"hello \\uFEFF\\u4F60\\u597D"''' + + when: + String parsed = StringValueParsing.parseSingleQuotedString(input) + + then: + parsed == '''hello 你好''' + } + + // TODO: How do we want to handle control characters? + def "escapes zero byte" () { + // TODO: This is a test case from the JS implementation. Do we want to implement this case? + given: + def input = '''"\\x00"''' + + when: + String parsed = StringValueParsing.parseSingleQuotedString(input) + + then: + parsed == '''\\u0000''' + } +} + From 5594036820d34413d204631c336ba4c4e4b8b4f9 Mon Sep 17 00:00:00 2001 From: dondonz <13839920+dondonz@users.noreply.github.com> Date: Tue, 29 Jun 2021 10:52:48 +1000 Subject: [PATCH 02/15] Fix Groovy Unicode parser issue --- .../graphql/parser/UnicodeUtilParserTest.groovy | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy b/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy index 3a5cba4bcf..1c1af5098f 100644 --- a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy +++ b/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy @@ -4,6 +4,7 @@ import graphql.language.Document import graphql.language.Field import graphql.language.OperationDefinition import graphql.language.StringValue +import spock.lang.Ignore import spock.lang.Specification class UnicodeUtilParserTest extends Specification { @@ -22,6 +23,7 @@ class UnicodeUtilParserTest extends Specification { // With this RFC, code points outside the Basic Multilingual Plane can be parsed. For example, emojis // Previously emojis could only be parsed with surrogate pairs. Now they can be parsed with the code point directly + @Ignore def "parsing beer stein as escaped unicode"() { given: def input = '''"\\u{1F37A} hello"''' @@ -33,6 +35,7 @@ class UnicodeUtilParserTest extends Specification { parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug } + @Ignore def "parsing beer mug non escaped"() { given: def input = '''"🍺 hello"''' @@ -44,6 +47,7 @@ class UnicodeUtilParserTest extends Specification { parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug } + @Ignore def "allow braced escaped unicode"() { def input = ''' { @@ -65,11 +69,12 @@ class UnicodeUtilParserTest extends Specification { From the RFC: For legacy reasons, a *supplementary character* may be escaped by two fixed-width unicode escape sequences forming a *surrogate pair*. For example - the input `"\uD83D\uDCA9"` is a valid {StringValue} which represents the same - Unicode text as `"\u{1F4A9}"`. While this legacy form is allowed, it should be + the input `"\\uD83D\\uDCA9"` is a valid {StringValue} which represents the same + Unicode text as `"\\u{1F4A9}"`. While this legacy form is allowed, it should be avoided as a variable-width unicode escape sequence is a clearer way to encode such code points. */ + @Ignore def "allow surrogate pairs escaped unicode"() { def input = ''' { @@ -93,6 +98,7 @@ class UnicodeUtilParserTest extends Specification { * Assert {trailingValue} is >= 0xDC00 and <= 0xDFFF (a *Trailing Surrogate*). * Return ({leadingValue} - 0xD800) × 0x400 + ({trailingValue} - 0xDC00) + 0x10000. */ + @Ignore def "invalid surrogate pair"() { def input = ''' { @@ -108,6 +114,7 @@ class UnicodeUtilParserTest extends Specification { false } + @Ignore def "invalid unicode code point"() { def input = ''' { @@ -123,6 +130,7 @@ class UnicodeUtilParserTest extends Specification { false } + @Ignore def "invalid unpaired surrogate" () { def input = ''' { @@ -138,6 +146,7 @@ class UnicodeUtilParserTest extends Specification { false } + @Ignore def "invalid code point - too long" () { given: def input = '''"\\u{000000000}"''' @@ -163,6 +172,7 @@ class UnicodeUtilParserTest extends Specification { As files are often concatenated, a *Byte Order Mark* may appear anywhere within a GraphQL document and is {Ignored}. */ + @Ignore def "byte order mark to be ignored" () { // The Byte Order Mark indicates a Unicode stream, and whether the stream is high-endian or low-endian given: @@ -176,6 +186,7 @@ class UnicodeUtilParserTest extends Specification { } // TODO: How do we want to handle control characters? + @Ignore def "escapes zero byte" () { // TODO: This is a test case from the JS implementation. Do we want to implement this case? given: From 5b12cbeff757c7280177375814b92a4387db152f Mon Sep 17 00:00:00 2001 From: dondonz <13839920+dondonz@users.noreply.github.com> Date: Tue, 29 Jun 2021 11:10:37 +1000 Subject: [PATCH 03/15] Add full Unicode to parser, the happy path --- src/main/antlr/GraphqlCommon.g4 | 4 +- .../graphql/parser/StringValueParsing.java | 13 ++--- src/main/java/graphql/parser/UnicodeUtil.java | 51 +++++++++++++++++++ .../parser/UnicodeUtilParserTest.groovy | 4 -- 4 files changed, 60 insertions(+), 12 deletions(-) create mode 100644 src/main/java/graphql/parser/UnicodeUtil.java diff --git a/src/main/antlr/GraphqlCommon.g4 b/src/main/antlr/GraphqlCommon.g4 index e7d500cc71..5ace483ff9 100644 --- a/src/main/antlr/GraphqlCommon.g4 +++ b/src/main/antlr/GraphqlCommon.g4 @@ -120,12 +120,12 @@ fragment BlockStringCharacter: ExtendedSourceCharacter; fragment StringCharacter: -([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) | // this is SoureCharacter without '"' and '\' +([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) | // this is SourceCharacter without '"' and '\' '\\u' EscapedUnicode | '\\' EscapedCharacter; fragment EscapedCharacter : ["\\/bfnrt]; -fragment EscapedUnicode : Hex Hex Hex Hex; +fragment EscapedUnicode : Hex Hex Hex Hex | '{' Hex+ '}'; fragment Hex : [0-9a-fA-F]; diff --git a/src/main/java/graphql/parser/StringValueParsing.java b/src/main/java/graphql/parser/StringValueParsing.java index a3c598c2d6..83b214cab1 100644 --- a/src/main/java/graphql/parser/StringValueParsing.java +++ b/src/main/java/graphql/parser/StringValueParsing.java @@ -30,7 +30,9 @@ public static String removeIndentation(String rawValue) { String[] lines = rawValue.split("\\n"); Integer commonIndent = null; for (int i = 0; i < lines.length; i++) { - if (i == 0) continue; + if (i == 0) { + continue; + } String line = lines[i]; int length = line.length(); int indent = leadingWhitespace(line); @@ -44,7 +46,9 @@ public static String removeIndentation(String rawValue) { if (commonIndent != null) { for (int i = 0; i < lineList.size(); i++) { String line = lineList.get(i); - if (i == 0) continue; + if (i == 0) { + continue; + } if (line.length() > commonIndent) { line = line.substring(commonIndent); lineList.set(i, line); @@ -135,10 +139,7 @@ public static String parseSingleQuotedString(String string) { writer.write('\t'); continue; case 'u': - String hexStr = string.substring(i + 1, i + 5); - int codepoint = Integer.parseInt(hexStr, 16); - i += 4; - writer.write(codepoint); + i = UnicodeUtil.parseAndWriteUnicode(writer, string, i); continue; default: Assert.assertShouldNeverHappen(); diff --git a/src/main/java/graphql/parser/UnicodeUtil.java b/src/main/java/graphql/parser/UnicodeUtil.java new file mode 100644 index 0000000000..0574ddb6a4 --- /dev/null +++ b/src/main/java/graphql/parser/UnicodeUtil.java @@ -0,0 +1,51 @@ +package graphql.parser; + +import graphql.Assert; +import graphql.Internal; + +import java.io.IOException; +import java.io.StringWriter; + +/** + * Contains Unicode helpers for parsing StringValue types in the grammar + */ +@Internal +public class UnicodeUtil { + public static int parseAndWriteUnicode(StringWriter writer, String string, int i) { + // Unicode characters can either be: + // - four hex characters in the form \\u597D, or + // - any number of hex characters surrounded by a brace in the form \\u{1F37A} + + // Four hex character only case \\u597D, for code points in the Basic Multilingual Plane (BMP) + if (isNotBracedEscape(string, i)) { + String hexStr = string.substring(i + 1, i + 5); + int codepoint = Integer.parseInt(hexStr, 16); + writer.write(codepoint); + return i + 4; + // TODO error checking of invalid values + } + + // Any number of hex characters e.g. \\u{1F37A}, which allows code points outside the Basic Multilingual Plane (BMP) + int startIx = i + 2; + int endIndexExclusive = startIx; + do { + if (endIndexExclusive + 1 >= string.length()) { + throw new RuntimeException("invalid unicode encoding"); + } + } while (string.charAt(++endIndexExclusive) != '}'); + + String hexStr = string.substring(startIx, endIndexExclusive); + char[] chars = Character.toChars(Integer.parseInt(hexStr, 16)); + try { + writer.write(chars); + } catch (IOException e) { + return Assert.assertShouldNeverHappen(); + } + return endIndexExclusive; + // TODO error checking of invalid values + } + + private static boolean isNotBracedEscape(String string, int i) { + return string.charAt(i + 1) != '{'; + } +} diff --git a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy b/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy index 1c1af5098f..583e9b26e5 100644 --- a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy +++ b/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy @@ -23,7 +23,6 @@ class UnicodeUtilParserTest extends Specification { // With this RFC, code points outside the Basic Multilingual Plane can be parsed. For example, emojis // Previously emojis could only be parsed with surrogate pairs. Now they can be parsed with the code point directly - @Ignore def "parsing beer stein as escaped unicode"() { given: def input = '''"\\u{1F37A} hello"''' @@ -35,7 +34,6 @@ class UnicodeUtilParserTest extends Specification { parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug } - @Ignore def "parsing beer mug non escaped"() { given: def input = '''"🍺 hello"''' @@ -47,7 +45,6 @@ class UnicodeUtilParserTest extends Specification { parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug } - @Ignore def "allow braced escaped unicode"() { def input = ''' { @@ -74,7 +71,6 @@ class UnicodeUtilParserTest extends Specification { avoided as a variable-width unicode escape sequence is a clearer way to encode such code points. */ - @Ignore def "allow surrogate pairs escaped unicode"() { def input = ''' { From dc349a0b7f75061680a626d6ea29f69c5a3e6501 Mon Sep 17 00:00:00 2001 From: dondonz <13839920+dondonz@users.noreply.github.com> Date: Tue, 29 Jun 2021 11:48:41 +1000 Subject: [PATCH 04/15] Add maximum unicode value check --- src/main/java/graphql/parser/UnicodeUtil.java | 45 ++++++++++++------- .../parser/UnicodeUtilParserTest.groovy | 9 ++-- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/main/java/graphql/parser/UnicodeUtil.java b/src/main/java/graphql/parser/UnicodeUtil.java index 0574ddb6a4..d90ccdbfcb 100644 --- a/src/main/java/graphql/parser/UnicodeUtil.java +++ b/src/main/java/graphql/parser/UnicodeUtil.java @@ -11,6 +11,8 @@ */ @Internal public class UnicodeUtil { + public static int MAX_UNICODE_CODE_POINT = 0x10FFFF; + public static int parseAndWriteUnicode(StringWriter writer, String string, int i) { // Unicode characters can either be: // - four hex characters in the form \\u597D, or @@ -23,29 +25,40 @@ public static int parseAndWriteUnicode(StringWriter writer, String string, int i writer.write(codepoint); return i + 4; // TODO error checking of invalid values - } + } else { + // Any number of hex characters e.g. \\u{1F37A}, which allows code points outside the Basic Multilingual Plane (BMP) + int startIx = i + 2; + int endIndexExclusive = startIx; + do { + if (endIndexExclusive + 1 >= string.length()) { + throw new RuntimeException("invalid unicode encoding"); + } + } while (string.charAt(++endIndexExclusive) != '}'); - // Any number of hex characters e.g. \\u{1F37A}, which allows code points outside the Basic Multilingual Plane (BMP) - int startIx = i + 2; - int endIndexExclusive = startIx; - do { - if (endIndexExclusive + 1 >= string.length()) { - throw new RuntimeException("invalid unicode encoding"); + String hexStr = string.substring(startIx, endIndexExclusive); + Integer hexValue = Integer.parseInt(hexStr, 16); + if (isValidUnicodeCodePoint(hexValue)) { + char[] chars = Character.toChars(hexValue); + try { + writer.write(chars); + } catch (IOException e) { + return Assert.assertShouldNeverHappen(); + } + return endIndexExclusive; + } else { + throw new RuntimeException("invalid unicode code point"); } - } while (string.charAt(++endIndexExclusive) != '}'); - - String hexStr = string.substring(startIx, endIndexExclusive); - char[] chars = Character.toChars(Integer.parseInt(hexStr, 16)); - try { - writer.write(chars); - } catch (IOException e) { - return Assert.assertShouldNeverHappen(); } - return endIndexExclusive; +// Assert.assertShouldNeverHappen(); // TODO error checking of invalid values } private static boolean isNotBracedEscape(String string, int i) { return string.charAt(i + 1) != '{'; } + + private static boolean isValidUnicodeCodePoint(Integer value) { + // TODO: Add bad surrogate checks + return value < MAX_UNICODE_CODE_POINT; + } } diff --git a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy b/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy index 583e9b26e5..0373a35beb 100644 --- a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy +++ b/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy @@ -4,6 +4,7 @@ import graphql.language.Document import graphql.language.Field import graphql.language.OperationDefinition import graphql.language.StringValue +import graphql.schema.validation.InvalidSchemaException import spock.lang.Ignore import spock.lang.Specification @@ -110,7 +111,6 @@ class UnicodeUtilParserTest extends Specification { false } - @Ignore def "invalid unicode code point"() { def input = ''' { @@ -122,8 +122,8 @@ class UnicodeUtilParserTest extends Specification { Document document = Parser.parse(input) then: - // TODO: Raise exception - false + Exception e = thrown(Exception) + e.message == "invalid unicode code point" } @Ignore @@ -151,7 +151,7 @@ class UnicodeUtilParserTest extends Specification { String parsed = StringValueParsing.parseSingleQuotedString(input) then: - // TODO: Discuss whether to raise exception + // TODO: Discuss whether to raise exception. How do we want to treat leading zeroes? false } @@ -169,6 +169,7 @@ class UnicodeUtilParserTest extends Specification { a GraphQL document and is {Ignored}. */ @Ignore + // TODO: BOM was previously implemented. Do we want to change the prior implementation? def "byte order mark to be ignored" () { // The Byte Order Mark indicates a Unicode stream, and whether the stream is high-endian or low-endian given: From c719779320b03beb86fd03696bd647ed3ac0a79b Mon Sep 17 00:00:00 2001 From: dondonz <13839920+dondonz@users.noreply.github.com> Date: Tue, 29 Jun 2021 11:57:57 +1000 Subject: [PATCH 05/15] Fix typo --- src/main/java/graphql/parser/UnicodeUtil.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/graphql/parser/UnicodeUtil.java b/src/main/java/graphql/parser/UnicodeUtil.java index d90ccdbfcb..b1ab251e83 100644 --- a/src/main/java/graphql/parser/UnicodeUtil.java +++ b/src/main/java/graphql/parser/UnicodeUtil.java @@ -59,6 +59,6 @@ private static boolean isNotBracedEscape(String string, int i) { private static boolean isValidUnicodeCodePoint(Integer value) { // TODO: Add bad surrogate checks - return value < MAX_UNICODE_CODE_POINT; + return value <= MAX_UNICODE_CODE_POINT; } } From ecfffc4a4009774a4e9a6082822c9771a37266d9 Mon Sep 17 00:00:00 2001 From: dondonz <13839920+dondonz@users.noreply.github.com> Date: Thu, 8 Jul 2021 14:46:49 +1000 Subject: [PATCH 06/15] Add tests --- .../parser/UnicodeUtilParserTest.groovy | 214 +++++++++++------- 1 file changed, 137 insertions(+), 77 deletions(-) diff --git a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy b/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy index 0373a35beb..76a68b257a 100644 --- a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy +++ b/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy @@ -4,26 +4,22 @@ import graphql.language.Document import graphql.language.Field import graphql.language.OperationDefinition import graphql.language.StringValue -import graphql.schema.validation.InvalidSchemaException -import spock.lang.Ignore import spock.lang.Specification class UnicodeUtilParserTest extends Specification { /* - Implements RFC to support full Unicode - Original RFC https://github.com/graphql/graphql-spec/issues/687 - RFC spec text https://github.com/graphql/graphql-spec/pull/849 - RFC JS implementation https://github.com/graphql/graphql-js/pull/3117 - - TL;DR - Previously, valid SourceCharacters included Unicode scalar values up to and including U+FFFF - the Basic Multilingual Plane (BMP) - Now this is changing to incorporate all Unicode scalar values - Assert {value} is a within the *Unicode scalar value* range (>= 0x0000 and <= 0xD7FF or >= 0xE000 and <= 0x10FFFF). - Practically this means you can have your beer emoji (U+1F37A) in queries as \\u{1F37A} + Implements RFC to support full Unicode https://github.com/graphql/graphql-spec/pull/849 + + Key changes + * SourceCharacters now include all Unicode scalar values. Previously only included up to U+FFFF (Basic Multilingual Plane). + * SourceCharacters now include control characters. Previously certain control characters were excluded. + * Surrogate pair validation added. + + Note that "unescaped" Unicode characters such as 🍺 are handled by ANTLR grammar. + "Escaped" Unicode characters such as \\u{1F37A} are handled by StringValueParsing. */ - // With this RFC, code points outside the Basic Multilingual Plane can be parsed. For example, emojis - // Previously emojis could only be parsed with surrogate pairs. Now they can be parsed with the code point directly + // With this RFC, escaped code points outside the Basic Multilingual Plane (e.g. emojis) can be parsed. def "parsing beer stein as escaped unicode"() { given: def input = '''"\\u{1F37A} hello"''' @@ -35,7 +31,7 @@ class UnicodeUtilParserTest extends Specification { parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug } - def "parsing beer mug non escaped"() { + def "parsing beer stein without escaping"() { given: def input = '''"🍺 hello"''' @@ -47,6 +43,7 @@ class UnicodeUtilParserTest extends Specification { } def "allow braced escaped unicode"() { + given: def input = ''' { foo(arg: "\\u{1F37A}") @@ -60,7 +57,7 @@ class UnicodeUtilParserTest extends Specification { def argValue = field.arguments[0].value as StringValue then: - argValue.getValue() == "🍺" + argValue.getValue() == "🍺" // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug } /* @@ -73,6 +70,7 @@ class UnicodeUtilParserTest extends Specification { such code points. */ def "allow surrogate pairs escaped unicode"() { + given: def input = ''' { foo(arg: "\\ud83c\\udf7a") @@ -86,114 +84,176 @@ class UnicodeUtilParserTest extends Specification { def argValue = field.arguments[0].value as StringValue then: - argValue.getValue() == "🍺" + argValue.getValue() == "🍺" // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug } /* - From the RFC: + Valid surrogate pair combinations (from the RFC): * If {leadingValue} is >= 0xD800 and <= 0xDBFF (a *Leading Surrogate*): * Assert {trailingValue} is >= 0xDC00 and <= 0xDFFF (a *Trailing Surrogate*). - * Return ({leadingValue} - 0xD800) × 0x400 + ({trailingValue} - 0xDC00) + 0x10000. */ - @Ignore - def "invalid surrogate pair"() { - def input = ''' - { - foo(arg: "\\uD83D\\uDBFF") - } - ''' + def "invalid surrogate pair - no trailing value"() { + given: + def input = '''"\\uD83D hello"''' when: - Document document = Parser.parse(input) + StringValueParsing.parseSingleQuotedString(input) then: - // TODO: Raise exception - false + RuntimeException e = thrown(RuntimeException) + e.message == "Invalid unicode" } - def "invalid unicode code point"() { - def input = ''' - { - foo(arg: "\\u{fffffff}") - } - ''' + def "invalid surrogate pair - end of string"() { + given: + def input = '''"\\uD83D"''' when: - Document document = Parser.parse(input) + StringValueParsing.parseSingleQuotedString(input) then: - Exception e = thrown(Exception) - e.message == "invalid unicode code point" + RuntimeException e = thrown(RuntimeException) + e.message == "Invalid unicode" } - @Ignore - def "invalid unpaired surrogate" () { - def input = ''' - { - foo(arg: "\\uD83D") - } - ''' + def "invalid surrogate pair - invalid trailing value"() { + given: + def input = '''"\\uD83D\\uDBFF"''' when: - Document document = Parser.parse(input) + StringValueParsing.parseSingleQuotedString(input) then: - // TODO: Discuss whether to raise exception - false + RuntimeException e = thrown(RuntimeException) + e.message == "Invalid unicode" } - @Ignore - def "invalid code point - too long" () { + def "invalid surrogate pair - no leading value"() { given: - def input = '''"\\u{000000000}"''' + def input = '''"\\uDC00"''' when: - String parsed = StringValueParsing.parseSingleQuotedString(input) + StringValueParsing.parseSingleQuotedString(input) then: - // TODO: Discuss whether to raise exception. How do we want to treat leading zeroes? - false + RuntimeException e = thrown(RuntimeException) + e.message == "Invalid unicode" } - /* - From the RFC - **Byte order mark** + def "invalid surrogate pair - invalid leading value"() { + given: + def input = '''"\\uD700\\uDC00"''' - UnicodeBOM :: "Byte Order Mark (U+FEFF)" + when: + StringValueParsing.parseSingleQuotedString(input) - The *Byte Order Mark* is a special Unicode code point which may appear at the - beginning of a file which programs may use to determine the fact that the text - stream is Unicode, and what specific encoding has been used. + then: + RuntimeException e = thrown(RuntimeException) + e.message == "Invalid unicode" + } - As files are often concatenated, a *Byte Order Mark* may appear anywhere within - a GraphQL document and is {Ignored}. - */ - @Ignore - // TODO: BOM was previously implemented. Do we want to change the prior implementation? - def "byte order mark to be ignored" () { - // The Byte Order Mark indicates a Unicode stream, and whether the stream is high-endian or low-endian + def "valid surrogate pair - leading code with braces"() { given: - def input = '''"hello \\uFEFF\\u4F60\\u597D"''' + def input = '''"hello \\u{d83c}\\udf7a"''' when: String parsed = StringValueParsing.parseSingleQuotedString(input) then: - parsed == '''hello 你好''' + parsed == '''hello 🍺''' // contains the beer icon U+1F37 A : http://www.charbase.com/1f37a-unicode-beer-mug } - // TODO: How do we want to handle control characters? - @Ignore - def "escapes zero byte" () { - // TODO: This is a test case from the JS implementation. Do we want to implement this case? + def "valid surrogate pair - trailing code with braces"() { given: - def input = '''"\\x00"''' + def input = '''"hello \\ud83c\\u{df7a}"''' when: String parsed = StringValueParsing.parseSingleQuotedString(input) then: - parsed == '''\\u0000''' + parsed == '''hello 🍺''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug } -} + def "valid surrogate pair - leading and trailing code with braces"() { + given: + def input = '''"hello \\u{d83c}\\u{df7a}"''' + + when: + String parsed = StringValueParsing.parseSingleQuotedString(input) + + then: + parsed == '''hello 🍺''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug + } + + def "invalid surrogate pair - invalid trailing code without unicode escape 1"() { + given: + def input = '''"hello \\u{d83c}{df7a}"''' + + when: + StringValueParsing.parseSingleQuotedString(input) + + then: + RuntimeException e = thrown(RuntimeException) + e.message == "Invalid unicode" + } + + def "invalid surrogate pair - invalid trailing code without unicode escape 2"() { + given: + def input = '''"hello \\u{d83c}df7a"''' + + when: + StringValueParsing.parseSingleQuotedString(input) + + then: + RuntimeException e = thrown(RuntimeException) + e.message == "Invalid unicode" + } + + def "invalid surrogate pair - invalid leading code"() { + given: + def input = '''"hello d83c\\u{df7a}"''' + + when: + StringValueParsing.parseSingleQuotedString(input) + + then: + RuntimeException e = thrown(RuntimeException) + e.message == "Invalid unicode" + } + + def "invalid surrogate pair - invalid leading value with braces"() { + given: + def input = '''"\\u{5B57}\\uDC00"''' + + when: + StringValueParsing.parseSingleQuotedString(input) + + then: + RuntimeException e = thrown(RuntimeException) + e.message == "Invalid unicode" + } + + def "invalid surrogate pair - invalid trailing value with braces"() { + given: + def input = '''"\\uD83D\\u{DBFF}"''' + + when: + StringValueParsing.parseSingleQuotedString(input) + + then: + RuntimeException e = thrown(RuntimeException) + e.message == "Invalid unicode" + } + + def "invalid unicode code point - value is too high"() { + given: + def input = '''"\\u{fffffff}"''' + + when: + StringValueParsing.parseSingleQuotedString(input) + + then: + RuntimeException e = thrown(RuntimeException) + e.message == "Invalid unicode" + } +} From 7f75c6c869fe7bd3410644347391518a38ed97cb Mon Sep 17 00:00:00 2001 From: dondonz <13839920+dondonz@users.noreply.github.com> Date: Thu, 8 Jul 2021 15:51:36 +1000 Subject: [PATCH 07/15] Add more end of string edge cases --- .../parser/UnicodeUtilParserTest.groovy | 86 +++++++++++++------ 1 file changed, 61 insertions(+), 25 deletions(-) diff --git a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy b/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy index 76a68b257a..56a0525d6b 100644 --- a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy +++ b/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy @@ -7,17 +7,17 @@ import graphql.language.StringValue import spock.lang.Specification class UnicodeUtilParserTest extends Specification { - /* - Implements RFC to support full Unicode https://github.com/graphql/graphql-spec/pull/849 - - Key changes - * SourceCharacters now include all Unicode scalar values. Previously only included up to U+FFFF (Basic Multilingual Plane). - * SourceCharacters now include control characters. Previously certain control characters were excluded. - * Surrogate pair validation added. - - Note that "unescaped" Unicode characters such as 🍺 are handled by ANTLR grammar. - "Escaped" Unicode characters such as \\u{1F37A} are handled by StringValueParsing. - */ + /** + * Implements RFC to support full Unicode https://github.com/graphql/graphql-spec/pull/849 + * + * Key changes + * + SourceCharacters now include all Unicode scalar values. Previously only included up to U+FFFF (Basic Multilingual Plane). + * + SourceCharacters now include control characters. Previously certain control characters were excluded. + * + Surrogate pair validation added. + * + * Note that "unescaped" Unicode characters such as 🍺 are handled by ANTLR grammar. + * "Escaped" Unicode characters such as \\u{1F37A} are handled by StringValueParsing. + */ // With this RFC, escaped code points outside the Basic Multilingual Plane (e.g. emojis) can be parsed. def "parsing beer stein as escaped unicode"() { @@ -60,15 +60,15 @@ class UnicodeUtilParserTest extends Specification { argValue.getValue() == "🍺" // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug } - /* - From the RFC: - For legacy reasons, a *supplementary character* may be escaped by two - fixed-width unicode escape sequences forming a *surrogate pair*. For example - the input `"\\uD83D\\uDCA9"` is a valid {StringValue} which represents the same - Unicode text as `"\\u{1F4A9}"`. While this legacy form is allowed, it should be - avoided as a variable-width unicode escape sequence is a clearer way to encode - such code points. - */ + /** + * From the RFC: + * For legacy reasons, a *supplementary character* may be escaped by two + * fixed-width unicode escape sequences forming a *surrogate pair*. For example + * the input `"\\uD83D\\uDCA9"` is a valid {StringValue} which represents the same + * Unicode text as `"\\u{1F4A9}"`. While this legacy form is allowed, it should be + * avoided as a variable-width unicode escape sequence is a clearer way to encode + * such code points. + */ def "allow surrogate pairs escaped unicode"() { given: def input = ''' @@ -84,13 +84,13 @@ class UnicodeUtilParserTest extends Specification { def argValue = field.arguments[0].value as StringValue then: - argValue.getValue() == "🍺" // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug + argValue.getValue() == "🍺" // contains the beer icon U+1F37 A : http://www.charbase.com/1f37a-unicode-beer-mug } - /* - Valid surrogate pair combinations (from the RFC): - * If {leadingValue} is >= 0xD800 and <= 0xDBFF (a *Leading Surrogate*): - * Assert {trailingValue} is >= 0xDC00 and <= 0xDFFF (a *Trailing Surrogate*). + /** + * Valid surrogate pair combinations (from the RFC): + * + If {leadingValue} is >= 0xD800 and <= 0xDBFF (a *Leading Surrogate*): + * + Assert {trailingValue} is >= 0xDC00 and <= 0xDFFF (a *Trailing Surrogate*). */ def "invalid surrogate pair - no trailing value"() { given: @@ -185,6 +185,42 @@ class UnicodeUtilParserTest extends Specification { parsed == '''hello 🍺''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug } + def "invalid surrogate pair - leading code with only \\ at end of string"() { + given: + def input = '''"hello \\u{d83c}\\"''' + + when: + StringValueParsing.parseSingleQuotedString(input) + + then: + RuntimeException e = thrown(RuntimeException) + e.message == "Invalid unicode" + } + + def "invalid surrogate pair - leading code with only \\u at end of string"() { + given: + def input = '''"hello \\u{d83c}\\u"''' + + when: + StringValueParsing.parseSingleQuotedString(input) + + then: + RuntimeException e = thrown(RuntimeException) + e.message == "Invalid unicode" + } + + def "invalid surrogate pair - trailing code without closing brace"() { + given: + def input = '''"hello \\u{d83c}\\u{df7a"''' + + when: + StringValueParsing.parseSingleQuotedString(input) + + then: + RuntimeException e = thrown(RuntimeException) + e.message == "Invalid unicode" + } + def "invalid surrogate pair - invalid trailing code without unicode escape 1"() { given: def input = '''"hello \\u{d83c}{df7a}"''' From 06d334f320314a10fa822471c6eb27e7b1ae766a Mon Sep 17 00:00:00 2001 From: dondonz <13839920+dondonz@users.noreply.github.com> Date: Thu, 8 Jul 2021 15:53:58 +1000 Subject: [PATCH 08/15] Add surrogate pair validation --- src/main/java/graphql/parser/UnicodeUtil.java | 131 ++++++++++++------ 1 file changed, 91 insertions(+), 40 deletions(-) diff --git a/src/main/java/graphql/parser/UnicodeUtil.java b/src/main/java/graphql/parser/UnicodeUtil.java index b1ab251e83..4f1b8f193b 100644 --- a/src/main/java/graphql/parser/UnicodeUtil.java +++ b/src/main/java/graphql/parser/UnicodeUtil.java @@ -1,64 +1,115 @@ package graphql.parser; -import graphql.Assert; import graphql.Internal; import java.io.IOException; import java.io.StringWriter; +import static graphql.Assert.assertShouldNeverHappen; + /** * Contains Unicode helpers for parsing StringValue types in the grammar */ @Internal public class UnicodeUtil { public static int MAX_UNICODE_CODE_POINT = 0x10FFFF; + public static int LEADING_SURROGATE_LOWER_BOUND = 0xD800; + public static int LEADING_SURROGATE_UPPER_BOUND = 0xDBFF; + public static int TRAILING_SURROGATE_LOWER_BOUND = 0xDC00; + public static int TRAILING_SURROGATE_UPPER_BOUND = 0xDFFF; public static int parseAndWriteUnicode(StringWriter writer, String string, int i) { - // Unicode characters can either be: - // - four hex characters in the form \\u597D, or - // - any number of hex characters surrounded by a brace in the form \\u{1F37A} - - // Four hex character only case \\u597D, for code points in the Basic Multilingual Plane (BMP) - if (isNotBracedEscape(string, i)) { - String hexStr = string.substring(i + 1, i + 5); - int codepoint = Integer.parseInt(hexStr, 16); - writer.write(codepoint); - return i + 4; - // TODO error checking of invalid values - } else { - // Any number of hex characters e.g. \\u{1F37A}, which allows code points outside the Basic Multilingual Plane (BMP) - int startIx = i + 2; - int endIndexExclusive = startIx; - do { - if (endIndexExclusive + 1 >= string.length()) { - throw new RuntimeException("invalid unicode encoding"); - } - } while (string.charAt(++endIndexExclusive) != '}'); - - String hexStr = string.substring(startIx, endIndexExclusive); - Integer hexValue = Integer.parseInt(hexStr, 16); - if (isValidUnicodeCodePoint(hexValue)) { - char[] chars = Character.toChars(hexValue); - try { - writer.write(chars); - } catch (IOException e) { - return Assert.assertShouldNeverHappen(); - } - return endIndexExclusive; - } else { - throw new RuntimeException("invalid unicode code point"); + // Unicode code points can either be: + // 1. Unbraced: four hex characters in the form \\u597D, or + // 2. Braced: any number of hex characters surrounded by braces in the form \\u{1F37A} + + // Extract the code point hex digits. Index i points to 'u' + int startIndex = isBracedEscape(string, i) ? i + 2 : i + 1; + int endIndexExclusive = getEndIndexExclusive(string, i); + // Index for parser to continue at, the last character of the escaped unicode character. Either } or hex digit + int continueIndex = isBracedEscape(string, i) ? endIndexExclusive : endIndexExclusive - 1; + + String hexStr = string.substring(startIndex, endIndexExclusive); + Integer codePoint = Integer.parseInt(hexStr, 16); + + if (isTrailingSurrogateValue(codePoint)) { + // A trailing surrogate value must be preceded with a leading surrogate value + throw new RuntimeException("Invalid unicode"); + } else if (isLeadingSurrogateValue(codePoint)) { + // A leading surrogate value must be followed by a trailing surrogate value + if (!isEscapedUnicode(string, continueIndex + 1)) { + throw new RuntimeException("Invalid unicode"); + } + + // Shift parser ahead to 'u' in second escaped Unicode character + i = continueIndex + 2; + int trailingStartIndex = isBracedEscape(string, i) ? i + 2 : i + 1; + int trailingEndIndexExclusive = getEndIndexExclusive(string, i); + String trailingHexStr = string.substring(trailingStartIndex, trailingEndIndexExclusive); + Integer trailingCodePoint = Integer.parseInt(trailingHexStr, 16); + + if (isTrailingSurrogateValue(trailingCodePoint)) { + writeCodePoint(writer, codePoint); + writeCodePoint(writer, trailingCodePoint); + continueIndex = isBracedEscape(string, i) ? trailingEndIndexExclusive : trailingEndIndexExclusive - 1; + return continueIndex; } + + throw new RuntimeException("Invalid unicode"); + } else if (isValidUnicodeCodePoint(codePoint)) { + writeCodePoint(writer, codePoint); + return continueIndex; } -// Assert.assertShouldNeverHappen(); - // TODO error checking of invalid values + + throw new RuntimeException("Invalid unicode"); } - private static boolean isNotBracedEscape(String string, int i) { - return string.charAt(i + 1) != '{'; + private static int getEndIndexExclusive(String string, int i) { + // Unbraced case, with exactly 4 hex digits + if (string.length() > i + 5 && !isBracedEscape(string, i)) { + return i + 5; + } + + // Braced case, with any number of hex digits + int endIndexExclusive = i + 2; + do { + if (endIndexExclusive + 1 >= string.length()) { + throw new RuntimeException("Invalid unicode"); + } + } while (string.charAt(++endIndexExclusive) != '}'); + + return endIndexExclusive; } - private static boolean isValidUnicodeCodePoint(Integer value) { - // TODO: Add bad surrogate checks + private static boolean isValidUnicodeCodePoint(int value) { return value <= MAX_UNICODE_CODE_POINT; } + + private static boolean isEscapedUnicode(String string, int index) { + if (index + 1 >= string.length()) { + throw new RuntimeException("Invalid unicode"); + } + return string.charAt(index) == '\\' && string.charAt(index + 1) == 'u'; + } + + private static boolean isLeadingSurrogateValue(int value) { + return LEADING_SURROGATE_LOWER_BOUND <= value && value <= LEADING_SURROGATE_UPPER_BOUND; + } + + private static boolean isTrailingSurrogateValue(int value) { + return TRAILING_SURROGATE_LOWER_BOUND <= value && value <= TRAILING_SURROGATE_UPPER_BOUND; + } + + private static void writeCodePoint(StringWriter writer, int codepoint) { + char[] chars = Character.toChars(codepoint); + try { + writer.write(chars); + } catch (IOException e) { + assertShouldNeverHappen(); + } + } + + private static boolean isBracedEscape(String string, int i) { + return string.charAt(i + 1) == '{'; + } } From 5e788a420915176a787f34b16faeb84855d26be7 Mon Sep 17 00:00:00 2001 From: dondonz <13839920+dondonz@users.noreply.github.com> Date: Fri, 9 Jul 2021 14:27:17 +1000 Subject: [PATCH 09/15] Raise InvalidSyntaxException for invalid Unicode --- src/main/java/graphql/parser/UnicodeUtil.java | 16 +++--- .../parser/UnicodeUtilParserTest.groovy | 56 +++++++++---------- 2 files changed, 35 insertions(+), 37 deletions(-) diff --git a/src/main/java/graphql/parser/UnicodeUtil.java b/src/main/java/graphql/parser/UnicodeUtil.java index 4f1b8f193b..2d131a5c36 100644 --- a/src/main/java/graphql/parser/UnicodeUtil.java +++ b/src/main/java/graphql/parser/UnicodeUtil.java @@ -33,12 +33,10 @@ public static int parseAndWriteUnicode(StringWriter writer, String string, int i Integer codePoint = Integer.parseInt(hexStr, 16); if (isTrailingSurrogateValue(codePoint)) { - // A trailing surrogate value must be preceded with a leading surrogate value - throw new RuntimeException("Invalid unicode"); + throw new InvalidSyntaxException(null, "Invalid unicode - trailing surrogate must be preceded with a leading surrogate -", null, string.substring(i - 1, continueIndex + 1), null); } else if (isLeadingSurrogateValue(codePoint)) { - // A leading surrogate value must be followed by a trailing surrogate value if (!isEscapedUnicode(string, continueIndex + 1)) { - throw new RuntimeException("Invalid unicode"); + throw new InvalidSyntaxException(null, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, string.substring(i - 1, continueIndex + 1), null); } // Shift parser ahead to 'u' in second escaped Unicode character @@ -47,21 +45,21 @@ public static int parseAndWriteUnicode(StringWriter writer, String string, int i int trailingEndIndexExclusive = getEndIndexExclusive(string, i); String trailingHexStr = string.substring(trailingStartIndex, trailingEndIndexExclusive); Integer trailingCodePoint = Integer.parseInt(trailingHexStr, 16); + continueIndex = isBracedEscape(string, i) ? trailingEndIndexExclusive : trailingEndIndexExclusive - 1; if (isTrailingSurrogateValue(trailingCodePoint)) { writeCodePoint(writer, codePoint); writeCodePoint(writer, trailingCodePoint); - continueIndex = isBracedEscape(string, i) ? trailingEndIndexExclusive : trailingEndIndexExclusive - 1; return continueIndex; } - throw new RuntimeException("Invalid unicode"); + throw new InvalidSyntaxException(null, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, string.substring(i - 1, continueIndex + 1), null); } else if (isValidUnicodeCodePoint(codePoint)) { writeCodePoint(writer, codePoint); return continueIndex; } - throw new RuntimeException("Invalid unicode"); + throw new InvalidSyntaxException(null, "Invalid unicode - not a valid code point -", null, string.substring(i - 1, continueIndex + 1), null); } private static int getEndIndexExclusive(String string, int i) { @@ -74,7 +72,7 @@ private static int getEndIndexExclusive(String string, int i) { int endIndexExclusive = i + 2; do { if (endIndexExclusive + 1 >= string.length()) { - throw new RuntimeException("Invalid unicode"); + throw new InvalidSyntaxException(null, "Invalid unicode - incorrectly formatted escape -", null, string.substring(i - 1, endIndexExclusive), null); } } while (string.charAt(++endIndexExclusive) != '}'); @@ -87,7 +85,7 @@ private static boolean isValidUnicodeCodePoint(int value) { private static boolean isEscapedUnicode(String string, int index) { if (index + 1 >= string.length()) { - throw new RuntimeException("Invalid unicode"); + return false; } return string.charAt(index) == '\\' && string.charAt(index + 1) == 'u'; } diff --git a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy b/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy index 56a0525d6b..93ab188cf7 100644 --- a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy +++ b/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy @@ -100,8 +100,8 @@ class UnicodeUtilParserTest extends Specification { StringValueParsing.parseSingleQuotedString(input) then: - RuntimeException e = thrown(RuntimeException) - e.message == "Invalid unicode" + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - leading surrogate must be followed by a trailing surrogate - offending token '\\uD83D'" } def "invalid surrogate pair - end of string"() { @@ -112,8 +112,8 @@ class UnicodeUtilParserTest extends Specification { StringValueParsing.parseSingleQuotedString(input) then: - RuntimeException e = thrown(RuntimeException) - e.message == "Invalid unicode" + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - leading surrogate must be followed by a trailing surrogate - offending token '\\uD83D'" } def "invalid surrogate pair - invalid trailing value"() { @@ -124,8 +124,8 @@ class UnicodeUtilParserTest extends Specification { StringValueParsing.parseSingleQuotedString(input) then: - RuntimeException e = thrown(RuntimeException) - e.message == "Invalid unicode" + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - leading surrogate must be followed by a trailing surrogate - offending token '\\uDBFF'" } def "invalid surrogate pair - no leading value"() { @@ -136,8 +136,8 @@ class UnicodeUtilParserTest extends Specification { StringValueParsing.parseSingleQuotedString(input) then: - RuntimeException e = thrown(RuntimeException) - e.message == "Invalid unicode" + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - trailing surrogate must be preceded with a leading surrogate - offending token '\\uDC00'" } def "invalid surrogate pair - invalid leading value"() { @@ -148,8 +148,8 @@ class UnicodeUtilParserTest extends Specification { StringValueParsing.parseSingleQuotedString(input) then: - RuntimeException e = thrown(RuntimeException) - e.message == "Invalid unicode" + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - trailing surrogate must be preceded with a leading surrogate - offending token '\\uDC00'" } def "valid surrogate pair - leading code with braces"() { @@ -193,8 +193,8 @@ class UnicodeUtilParserTest extends Specification { StringValueParsing.parseSingleQuotedString(input) then: - RuntimeException e = thrown(RuntimeException) - e.message == "Invalid unicode" + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - leading surrogate must be followed by a trailing surrogate - offending token '\\u{d83c}'" } def "invalid surrogate pair - leading code with only \\u at end of string"() { @@ -205,8 +205,8 @@ class UnicodeUtilParserTest extends Specification { StringValueParsing.parseSingleQuotedString(input) then: - RuntimeException e = thrown(RuntimeException) - e.message == "Invalid unicode" + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - incorrectly formatted escape - offending token '\\u\"'" } def "invalid surrogate pair - trailing code without closing brace"() { @@ -217,8 +217,8 @@ class UnicodeUtilParserTest extends Specification { StringValueParsing.parseSingleQuotedString(input) then: - RuntimeException e = thrown(RuntimeException) - e.message == "Invalid unicode" + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - incorrectly formatted escape - offending token '\\u{df7a'" } def "invalid surrogate pair - invalid trailing code without unicode escape 1"() { @@ -229,8 +229,8 @@ class UnicodeUtilParserTest extends Specification { StringValueParsing.parseSingleQuotedString(input) then: - RuntimeException e = thrown(RuntimeException) - e.message == "Invalid unicode" + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - leading surrogate must be followed by a trailing surrogate - offending token '\\u{d83c}'" } def "invalid surrogate pair - invalid trailing code without unicode escape 2"() { @@ -241,8 +241,8 @@ class UnicodeUtilParserTest extends Specification { StringValueParsing.parseSingleQuotedString(input) then: - RuntimeException e = thrown(RuntimeException) - e.message == "Invalid unicode" + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - leading surrogate must be followed by a trailing surrogate - offending token '\\u{d83c}'" } def "invalid surrogate pair - invalid leading code"() { @@ -253,8 +253,8 @@ class UnicodeUtilParserTest extends Specification { StringValueParsing.parseSingleQuotedString(input) then: - RuntimeException e = thrown(RuntimeException) - e.message == "Invalid unicode" + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - trailing surrogate must be preceded with a leading surrogate - offending token '\\u{df7a}'" } def "invalid surrogate pair - invalid leading value with braces"() { @@ -265,8 +265,8 @@ class UnicodeUtilParserTest extends Specification { StringValueParsing.parseSingleQuotedString(input) then: - RuntimeException e = thrown(RuntimeException) - e.message == "Invalid unicode" + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - trailing surrogate must be preceded with a leading surrogate - offending token '\\uDC00'" } def "invalid surrogate pair - invalid trailing value with braces"() { @@ -277,8 +277,8 @@ class UnicodeUtilParserTest extends Specification { StringValueParsing.parseSingleQuotedString(input) then: - RuntimeException e = thrown(RuntimeException) - e.message == "Invalid unicode" + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - leading surrogate must be followed by a trailing surrogate - offending token '\\u{DBFF}'" } def "invalid unicode code point - value is too high"() { @@ -289,7 +289,7 @@ class UnicodeUtilParserTest extends Specification { StringValueParsing.parseSingleQuotedString(input) then: - RuntimeException e = thrown(RuntimeException) - e.message == "Invalid unicode" + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - not a valid code point - offending token '\\u{fffffff}'" } } From dd290eabdc219ba5e725e0364cddcd2210fa169b Mon Sep 17 00:00:00 2001 From: dondonz <13839920+dondonz@users.noreply.github.com> Date: Wed, 14 Jul 2021 11:54:02 +1000 Subject: [PATCH 10/15] Update ANTLR grammar with new SourceCharacter definition --- src/main/antlr/GraphqlCommon.g4 | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/main/antlr/GraphqlCommon.g4 b/src/main/antlr/GraphqlCommon.g4 index 5ace483ff9..5ee4110a77 100644 --- a/src/main/antlr/GraphqlCommon.g4 +++ b/src/main/antlr/GraphqlCommon.g4 @@ -117,10 +117,15 @@ StringValue: fragment BlockStringCharacter: '\\"""'| -ExtendedSourceCharacter; +SourceCharacter; +// this is SourceCharacter without +// \u000a New line +// \u000d Carriage return +// \u0022 '"' +// \u005c '\' fragment StringCharacter: -([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) | // this is SourceCharacter without '"' and '\' +([\u0000-\u0009] | [\u000b\u000c\u000e-\u0021] | [\u0023-\u005b] | [\u005d-\ud7ff] | [\ue000-\u{10ffff}]) | '\\u' EscapedUnicode | '\\' EscapedCharacter; @@ -128,20 +133,13 @@ fragment EscapedCharacter : ["\\/bfnrt]; fragment EscapedUnicode : Hex Hex Hex Hex | '{' Hex+ '}'; fragment Hex : [0-9a-fA-F]; +// this is the spec definition. Excludes surrogate leading and trailing values. +fragment SourceCharacter : [\u0000-\ud7ff] | [\ue000-\u{10ffff}]; -// this is currently not covered by the spec because we allow all unicode chars -// u0009 = \t Horizontal tab -// u000a = \n line feed -// u000d = \r carriage return -// u0020 = space -fragment ExtendedSourceCharacter :[\u0009\u000A\u000D\u0020-\u{10FFFF}]; -fragment ExtendedSourceCharacterWithoutLineFeed :[\u0009\u0020-\u{10FFFF}]; +// CommentChar +fragment SourceCharacterWithoutLineFeed : [\u0000-\u0009] | [\u000b\u000c\u000e-\ud7ff] | [\ue000-\u{10ffff}]; -// this is the spec definition -// fragment SourceCharacter :[\u0009\u000A\u000D\u0020-\uFFFF]; - - -Comment: '#' ExtendedSourceCharacterWithoutLineFeed* -> channel(2); +Comment: '#' SourceCharacterWithoutLineFeed* -> channel(2); LF: [\n] -> channel(3); CR: [\r] -> channel(3); From 2287201f77044cdc425b303f13a0219c903f88c1 Mon Sep 17 00:00:00 2001 From: dondonz <13839920+dondonz@users.noreply.github.com> Date: Wed, 14 Jul 2021 12:04:48 +1000 Subject: [PATCH 11/15] Tidy test name and location --- .../groovy/graphql/parser/ParserTest.groovy | 36 ++++++++++++++++ ...y => StringValueParsingUnicodeTest.groovy} | 43 ++----------------- 2 files changed, 39 insertions(+), 40 deletions(-) rename src/test/groovy/graphql/parser/{UnicodeUtilParserTest.groovy => StringValueParsingUnicodeTest.groovy} (87%) diff --git a/src/test/groovy/graphql/parser/ParserTest.groovy b/src/test/groovy/graphql/parser/ParserTest.groovy index 790491e80b..c39da47874 100644 --- a/src/test/groovy/graphql/parser/ParserTest.groovy +++ b/src/test/groovy/graphql/parser/ParserTest.groovy @@ -976,4 +976,40 @@ triple3 : """edge cases \\""" "" " \\"" \\" edge cases""" !type.getIgnoredChars().getLeft().isEmpty() !type.getIgnoredChars().getRight().isEmpty() } + + def "allow braced escaped unicode"() { + given: + def input = ''' + { + foo(arg: "\\u{1F37A}") + } + ''' + + when: + Document document = Parser.parse(input) + OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition) + def field = operationDefinition.getSelectionSet().getSelections()[0] as Field + def argValue = field.arguments[0].value as StringValue + + then: + argValue.getValue() == "🍺" // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug + } + + def "allow surrogate pairs escaped unicode"() { + given: + def input = ''' + { + foo(arg: "\\ud83c\\udf7a") + } + ''' + + when: + Document document = Parser.parse(input) + OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition) + def field = operationDefinition.getSelectionSet().getSelections()[0] as Field + def argValue = field.arguments[0].value as StringValue + + then: + argValue.getValue() == "🍺" // contains the beer icon U+1F37 A : http://www.charbase.com/1f37a-unicode-beer-mug + } } diff --git a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy b/src/test/groovy/graphql/parser/StringValueParsingUnicodeTest.groovy similarity index 87% rename from src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy rename to src/test/groovy/graphql/parser/StringValueParsingUnicodeTest.groovy index 93ab188cf7..63c59d8011 100644 --- a/src/test/groovy/graphql/parser/UnicodeUtilParserTest.groovy +++ b/src/test/groovy/graphql/parser/StringValueParsingUnicodeTest.groovy @@ -6,7 +6,7 @@ import graphql.language.OperationDefinition import graphql.language.StringValue import spock.lang.Specification -class UnicodeUtilParserTest extends Specification { +class StringValueParsingUnicodeTest extends Specification { /** * Implements RFC to support full Unicode https://github.com/graphql/graphql-spec/pull/849 * @@ -42,24 +42,6 @@ class UnicodeUtilParserTest extends Specification { parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug } - def "allow braced escaped unicode"() { - given: - def input = ''' - { - foo(arg: "\\u{1F37A}") - } - ''' - - when: - Document document = Parser.parse(input) - OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition) - def field = operationDefinition.getSelectionSet().getSelections()[0] as Field - def argValue = field.arguments[0].value as StringValue - - then: - argValue.getValue() == "🍺" // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug - } - /** * From the RFC: * For legacy reasons, a *supplementary character* may be escaped by two @@ -68,27 +50,8 @@ class UnicodeUtilParserTest extends Specification { * Unicode text as `"\\u{1F4A9}"`. While this legacy form is allowed, it should be * avoided as a variable-width unicode escape sequence is a clearer way to encode * such code points. - */ - def "allow surrogate pairs escaped unicode"() { - given: - def input = ''' - { - foo(arg: "\\ud83c\\udf7a") - } - ''' - - when: - Document document = Parser.parse(input) - OperationDefinition operationDefinition = (document.definitions[0] as OperationDefinition) - def field = operationDefinition.getSelectionSet().getSelections()[0] as Field - def argValue = field.arguments[0].value as StringValue - - then: - argValue.getValue() == "🍺" // contains the beer icon U+1F37 A : http://www.charbase.com/1f37a-unicode-beer-mug - } - - /** - * Valid surrogate pair combinations (from the RFC): + * + * Valid surrogate pair combinations: * + If {leadingValue} is >= 0xD800 and <= 0xDBFF (a *Leading Surrogate*): * + Assert {trailingValue} is >= 0xDC00 and <= 0xDFFF (a *Trailing Surrogate*). */ From c93fd5a0d25e8952a68674a8413377a7a139fc7b Mon Sep 17 00:00:00 2001 From: dondonz <13839920+dondonz@users.noreply.github.com> Date: Wed, 14 Jul 2021 12:35:48 +1000 Subject: [PATCH 12/15] Add source location to Unicode error messages --- src/main/java/graphql/parser/AntlrHelper.java | 4 ++ .../parser/GraphqlAntlrToLanguage.java | 9 +++-- .../graphql/parser/StringValueParsing.java | 5 ++- src/main/java/graphql/parser/UnicodeUtil.java | 19 +++++----- .../StringValueParsingUnicodeTest.groovy | 38 +++++++++---------- 5 files changed, 41 insertions(+), 34 deletions(-) diff --git a/src/main/java/graphql/parser/AntlrHelper.java b/src/main/java/graphql/parser/AntlrHelper.java index a3fc74d272..4eb52d87ba 100644 --- a/src/main/java/graphql/parser/AntlrHelper.java +++ b/src/main/java/graphql/parser/AntlrHelper.java @@ -3,6 +3,7 @@ import graphql.Internal; import graphql.language.SourceLocation; import org.antlr.v4.runtime.Token; +import org.antlr.v4.runtime.tree.TerminalNode; import java.util.List; @@ -28,6 +29,9 @@ public static SourceLocation createSourceLocation(MultiSourceReader multiSourceR return AntlrHelper.createSourceLocation(multiSourceReader, token.getLine(), token.getCharPositionInLine()); } + public static SourceLocation createSourceLocation(MultiSourceReader multiSourceReader, TerminalNode terminalNode) { + return AntlrHelper.createSourceLocation(multiSourceReader, terminalNode.getSymbol().getLine(), terminalNode.getSymbol().getCharPositionInLine()); + } /* grabs 3 lines before and after the syntax error */ public static String createPreview(MultiSourceReader multiSourceReader, int antrlLine) { diff --git a/src/main/java/graphql/parser/GraphqlAntlrToLanguage.java b/src/main/java/graphql/parser/GraphqlAntlrToLanguage.java index 770a590c27..6e9ecfe6c1 100644 --- a/src/main/java/graphql/parser/GraphqlAntlrToLanguage.java +++ b/src/main/java/graphql/parser/GraphqlAntlrToLanguage.java @@ -760,13 +760,14 @@ protected Value createValue(GraphqlParser.ValueContext ctx) { return assertShouldNeverHappen(); } - static String quotedString(TerminalNode terminalNode) { + protected String quotedString(TerminalNode terminalNode) { boolean multiLine = terminalNode.getText().startsWith("\"\"\""); String strText = terminalNode.getText(); + SourceLocation sourceLocation = AntlrHelper.createSourceLocation(multiSourceReader, terminalNode); if (multiLine) { return parseTripleQuotedString(strText); } else { - return parseSingleQuotedString(strText); + return parseSingleQuotedString(strText, sourceLocation); } } @@ -839,12 +840,12 @@ protected Description newDescription(GraphqlParser.DescriptionContext descriptio } String content = terminalNode.getText(); boolean multiLine = content.startsWith("\"\"\""); + SourceLocation sourceLocation = getSourceLocation(descriptionCtx); if (multiLine) { content = parseTripleQuotedString(content); } else { - content = parseSingleQuotedString(content); + content = parseSingleQuotedString(content, sourceLocation); } - SourceLocation sourceLocation = getSourceLocation(descriptionCtx); return new Description(content, sourceLocation, multiLine); } diff --git a/src/main/java/graphql/parser/StringValueParsing.java b/src/main/java/graphql/parser/StringValueParsing.java index 83b214cab1..4fe1219061 100644 --- a/src/main/java/graphql/parser/StringValueParsing.java +++ b/src/main/java/graphql/parser/StringValueParsing.java @@ -2,6 +2,7 @@ import graphql.Assert; import graphql.Internal; +import graphql.language.SourceLocation; import java.io.StringWriter; import java.util.ArrayList; @@ -102,7 +103,7 @@ private static boolean containsOnlyWhiteSpace(String str) { return leadingWhitespace(str) == str.length(); } - public static String parseSingleQuotedString(String string) { + public static String parseSingleQuotedString(String string, SourceLocation sourceLocation) { StringWriter writer = new StringWriter(string.length() - 2); int end = string.length() - 1; for (int i = 1; i < end; i++) { @@ -139,7 +140,7 @@ public static String parseSingleQuotedString(String string) { writer.write('\t'); continue; case 'u': - i = UnicodeUtil.parseAndWriteUnicode(writer, string, i); + i = UnicodeUtil.parseAndWriteUnicode(writer, string, i, sourceLocation); continue; default: Assert.assertShouldNeverHappen(); diff --git a/src/main/java/graphql/parser/UnicodeUtil.java b/src/main/java/graphql/parser/UnicodeUtil.java index 2d131a5c36..53d1cefe05 100644 --- a/src/main/java/graphql/parser/UnicodeUtil.java +++ b/src/main/java/graphql/parser/UnicodeUtil.java @@ -1,6 +1,7 @@ package graphql.parser; import graphql.Internal; +import graphql.language.SourceLocation; import java.io.IOException; import java.io.StringWriter; @@ -18,14 +19,14 @@ public class UnicodeUtil { public static int TRAILING_SURROGATE_LOWER_BOUND = 0xDC00; public static int TRAILING_SURROGATE_UPPER_BOUND = 0xDFFF; - public static int parseAndWriteUnicode(StringWriter writer, String string, int i) { + public static int parseAndWriteUnicode(StringWriter writer, String string, int i, SourceLocation sourceLocation) { // Unicode code points can either be: // 1. Unbraced: four hex characters in the form \\u597D, or // 2. Braced: any number of hex characters surrounded by braces in the form \\u{1F37A} // Extract the code point hex digits. Index i points to 'u' int startIndex = isBracedEscape(string, i) ? i + 2 : i + 1; - int endIndexExclusive = getEndIndexExclusive(string, i); + int endIndexExclusive = getEndIndexExclusive(string, i, sourceLocation); // Index for parser to continue at, the last character of the escaped unicode character. Either } or hex digit int continueIndex = isBracedEscape(string, i) ? endIndexExclusive : endIndexExclusive - 1; @@ -33,16 +34,16 @@ public static int parseAndWriteUnicode(StringWriter writer, String string, int i Integer codePoint = Integer.parseInt(hexStr, 16); if (isTrailingSurrogateValue(codePoint)) { - throw new InvalidSyntaxException(null, "Invalid unicode - trailing surrogate must be preceded with a leading surrogate -", null, string.substring(i - 1, continueIndex + 1), null); + throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - trailing surrogate must be preceded with a leading surrogate -", null, string.substring(i - 1, continueIndex + 1), null); } else if (isLeadingSurrogateValue(codePoint)) { if (!isEscapedUnicode(string, continueIndex + 1)) { - throw new InvalidSyntaxException(null, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, string.substring(i - 1, continueIndex + 1), null); + throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, string.substring(i - 1, continueIndex + 1), null); } // Shift parser ahead to 'u' in second escaped Unicode character i = continueIndex + 2; int trailingStartIndex = isBracedEscape(string, i) ? i + 2 : i + 1; - int trailingEndIndexExclusive = getEndIndexExclusive(string, i); + int trailingEndIndexExclusive = getEndIndexExclusive(string, i, sourceLocation); String trailingHexStr = string.substring(trailingStartIndex, trailingEndIndexExclusive); Integer trailingCodePoint = Integer.parseInt(trailingHexStr, 16); continueIndex = isBracedEscape(string, i) ? trailingEndIndexExclusive : trailingEndIndexExclusive - 1; @@ -53,16 +54,16 @@ public static int parseAndWriteUnicode(StringWriter writer, String string, int i return continueIndex; } - throw new InvalidSyntaxException(null, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, string.substring(i - 1, continueIndex + 1), null); + throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, string.substring(i - 1, continueIndex + 1), null); } else if (isValidUnicodeCodePoint(codePoint)) { writeCodePoint(writer, codePoint); return continueIndex; } - throw new InvalidSyntaxException(null, "Invalid unicode - not a valid code point -", null, string.substring(i - 1, continueIndex + 1), null); + throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - not a valid code point -", null, string.substring(i - 1, continueIndex + 1), null); } - private static int getEndIndexExclusive(String string, int i) { + private static int getEndIndexExclusive(String string, int i, SourceLocation sourceLocation) { // Unbraced case, with exactly 4 hex digits if (string.length() > i + 5 && !isBracedEscape(string, i)) { return i + 5; @@ -72,7 +73,7 @@ private static int getEndIndexExclusive(String string, int i) { int endIndexExclusive = i + 2; do { if (endIndexExclusive + 1 >= string.length()) { - throw new InvalidSyntaxException(null, "Invalid unicode - incorrectly formatted escape -", null, string.substring(i - 1, endIndexExclusive), null); + throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - incorrectly formatted escape -", null, string.substring(i - 1, endIndexExclusive), null); } } while (string.charAt(++endIndexExclusive) != '}'); diff --git a/src/test/groovy/graphql/parser/StringValueParsingUnicodeTest.groovy b/src/test/groovy/graphql/parser/StringValueParsingUnicodeTest.groovy index 63c59d8011..ca8f2035fb 100644 --- a/src/test/groovy/graphql/parser/StringValueParsingUnicodeTest.groovy +++ b/src/test/groovy/graphql/parser/StringValueParsingUnicodeTest.groovy @@ -25,7 +25,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\u{1F37A} hello"''' when: - String parsed = StringValueParsing.parseSingleQuotedString(input) + String parsed = StringValueParsing.parseSingleQuotedString(input, null) then: parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug @@ -36,7 +36,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"🍺 hello"''' when: - String parsed = StringValueParsing.parseSingleQuotedString(input) + String parsed = StringValueParsing.parseSingleQuotedString(input, null) then: parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug @@ -60,7 +60,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\uD83D hello"''' when: - StringValueParsing.parseSingleQuotedString(input) + StringValueParsing.parseSingleQuotedString(input, null) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -72,7 +72,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\uD83D"''' when: - StringValueParsing.parseSingleQuotedString(input) + StringValueParsing.parseSingleQuotedString(input, null) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -84,7 +84,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\uD83D\\uDBFF"''' when: - StringValueParsing.parseSingleQuotedString(input) + StringValueParsing.parseSingleQuotedString(input, null) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -96,7 +96,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\uDC00"''' when: - StringValueParsing.parseSingleQuotedString(input) + StringValueParsing.parseSingleQuotedString(input, null) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -108,7 +108,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\uD700\\uDC00"''' when: - StringValueParsing.parseSingleQuotedString(input) + StringValueParsing.parseSingleQuotedString(input, null) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -120,7 +120,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\u{d83c}\\udf7a"''' when: - String parsed = StringValueParsing.parseSingleQuotedString(input) + String parsed = StringValueParsing.parseSingleQuotedString(input, null) then: parsed == '''hello 🍺''' // contains the beer icon U+1F37 A : http://www.charbase.com/1f37a-unicode-beer-mug @@ -131,7 +131,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\ud83c\\u{df7a}"''' when: - String parsed = StringValueParsing.parseSingleQuotedString(input) + String parsed = StringValueParsing.parseSingleQuotedString(input, null) then: parsed == '''hello 🍺''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug @@ -142,7 +142,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\u{d83c}\\u{df7a}"''' when: - String parsed = StringValueParsing.parseSingleQuotedString(input) + String parsed = StringValueParsing.parseSingleQuotedString(input, null) then: parsed == '''hello 🍺''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug @@ -153,7 +153,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\u{d83c}\\"''' when: - StringValueParsing.parseSingleQuotedString(input) + StringValueParsing.parseSingleQuotedString(input, null) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -165,7 +165,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\u{d83c}\\u"''' when: - StringValueParsing.parseSingleQuotedString(input) + StringValueParsing.parseSingleQuotedString(input, null) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -177,7 +177,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\u{d83c}\\u{df7a"''' when: - StringValueParsing.parseSingleQuotedString(input) + StringValueParsing.parseSingleQuotedString(input, null) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -189,7 +189,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\u{d83c}{df7a}"''' when: - StringValueParsing.parseSingleQuotedString(input) + StringValueParsing.parseSingleQuotedString(input, null) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -201,7 +201,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\u{d83c}df7a"''' when: - StringValueParsing.parseSingleQuotedString(input) + StringValueParsing.parseSingleQuotedString(input, null) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -213,7 +213,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello d83c\\u{df7a}"''' when: - StringValueParsing.parseSingleQuotedString(input) + StringValueParsing.parseSingleQuotedString(input, null) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -225,7 +225,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\u{5B57}\\uDC00"''' when: - StringValueParsing.parseSingleQuotedString(input) + StringValueParsing.parseSingleQuotedString(input, null) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -237,7 +237,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\uD83D\\u{DBFF}"''' when: - StringValueParsing.parseSingleQuotedString(input) + StringValueParsing.parseSingleQuotedString(input, null) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -249,7 +249,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\u{fffffff}"''' when: - StringValueParsing.parseSingleQuotedString(input) + StringValueParsing.parseSingleQuotedString(input, null) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) From 66c0f975f5820cc6f76151c5036a720ce4a89a2e Mon Sep 17 00:00:00 2001 From: dondonz <13839920+dondonz@users.noreply.github.com> Date: Wed, 14 Jul 2021 15:31:30 +1000 Subject: [PATCH 13/15] Add String parser overload when SourceLocation is not available --- .../graphql/parser/StringValueParsing.java | 4 ++ .../StringValueParsingUnicodeTest.groovy | 38 +++++++++---------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/main/java/graphql/parser/StringValueParsing.java b/src/main/java/graphql/parser/StringValueParsing.java index 4fe1219061..d9da00dc83 100644 --- a/src/main/java/graphql/parser/StringValueParsing.java +++ b/src/main/java/graphql/parser/StringValueParsing.java @@ -148,4 +148,8 @@ public static String parseSingleQuotedString(String string, SourceLocation sourc } return writer.toString(); } + + public static String parseSingleQuotedString(String string) { + return parseSingleQuotedString(string, null); + } } diff --git a/src/test/groovy/graphql/parser/StringValueParsingUnicodeTest.groovy b/src/test/groovy/graphql/parser/StringValueParsingUnicodeTest.groovy index ca8f2035fb..63c59d8011 100644 --- a/src/test/groovy/graphql/parser/StringValueParsingUnicodeTest.groovy +++ b/src/test/groovy/graphql/parser/StringValueParsingUnicodeTest.groovy @@ -25,7 +25,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\u{1F37A} hello"''' when: - String parsed = StringValueParsing.parseSingleQuotedString(input, null) + String parsed = StringValueParsing.parseSingleQuotedString(input) then: parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug @@ -36,7 +36,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"🍺 hello"''' when: - String parsed = StringValueParsing.parseSingleQuotedString(input, null) + String parsed = StringValueParsing.parseSingleQuotedString(input) then: parsed == '''🍺 hello''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug @@ -60,7 +60,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\uD83D hello"''' when: - StringValueParsing.parseSingleQuotedString(input, null) + StringValueParsing.parseSingleQuotedString(input) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -72,7 +72,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\uD83D"''' when: - StringValueParsing.parseSingleQuotedString(input, null) + StringValueParsing.parseSingleQuotedString(input) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -84,7 +84,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\uD83D\\uDBFF"''' when: - StringValueParsing.parseSingleQuotedString(input, null) + StringValueParsing.parseSingleQuotedString(input) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -96,7 +96,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\uDC00"''' when: - StringValueParsing.parseSingleQuotedString(input, null) + StringValueParsing.parseSingleQuotedString(input) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -108,7 +108,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\uD700\\uDC00"''' when: - StringValueParsing.parseSingleQuotedString(input, null) + StringValueParsing.parseSingleQuotedString(input) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -120,7 +120,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\u{d83c}\\udf7a"''' when: - String parsed = StringValueParsing.parseSingleQuotedString(input, null) + String parsed = StringValueParsing.parseSingleQuotedString(input) then: parsed == '''hello 🍺''' // contains the beer icon U+1F37 A : http://www.charbase.com/1f37a-unicode-beer-mug @@ -131,7 +131,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\ud83c\\u{df7a}"''' when: - String parsed = StringValueParsing.parseSingleQuotedString(input, null) + String parsed = StringValueParsing.parseSingleQuotedString(input) then: parsed == '''hello 🍺''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug @@ -142,7 +142,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\u{d83c}\\u{df7a}"''' when: - String parsed = StringValueParsing.parseSingleQuotedString(input, null) + String parsed = StringValueParsing.parseSingleQuotedString(input) then: parsed == '''hello 🍺''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug @@ -153,7 +153,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\u{d83c}\\"''' when: - StringValueParsing.parseSingleQuotedString(input, null) + StringValueParsing.parseSingleQuotedString(input) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -165,7 +165,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\u{d83c}\\u"''' when: - StringValueParsing.parseSingleQuotedString(input, null) + StringValueParsing.parseSingleQuotedString(input) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -177,7 +177,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\u{d83c}\\u{df7a"''' when: - StringValueParsing.parseSingleQuotedString(input, null) + StringValueParsing.parseSingleQuotedString(input) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -189,7 +189,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\u{d83c}{df7a}"''' when: - StringValueParsing.parseSingleQuotedString(input, null) + StringValueParsing.parseSingleQuotedString(input) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -201,7 +201,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello \\u{d83c}df7a"''' when: - StringValueParsing.parseSingleQuotedString(input, null) + StringValueParsing.parseSingleQuotedString(input) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -213,7 +213,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"hello d83c\\u{df7a}"''' when: - StringValueParsing.parseSingleQuotedString(input, null) + StringValueParsing.parseSingleQuotedString(input) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -225,7 +225,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\u{5B57}\\uDC00"''' when: - StringValueParsing.parseSingleQuotedString(input, null) + StringValueParsing.parseSingleQuotedString(input) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -237,7 +237,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\uD83D\\u{DBFF}"''' when: - StringValueParsing.parseSingleQuotedString(input, null) + StringValueParsing.parseSingleQuotedString(input) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) @@ -249,7 +249,7 @@ class StringValueParsingUnicodeTest extends Specification { def input = '''"\\u{fffffff}"''' when: - StringValueParsing.parseSingleQuotedString(input, null) + StringValueParsing.parseSingleQuotedString(input) then: InvalidSyntaxException e = thrown(InvalidSyntaxException) From 85a5234698469ead601cda52e0344b4d2d2fc99d Mon Sep 17 00:00:00 2001 From: dondonz <13839920+dondonz@users.noreply.github.com> Date: Wed, 14 Jul 2021 15:41:57 +1000 Subject: [PATCH 14/15] Add Parser tests with SourceLocation in exception message --- .../groovy/graphql/parser/ParserTest.groovy | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/test/groovy/graphql/parser/ParserTest.groovy b/src/test/groovy/graphql/parser/ParserTest.groovy index c39da47874..dc92da158f 100644 --- a/src/test/groovy/graphql/parser/ParserTest.groovy +++ b/src/test/groovy/graphql/parser/ParserTest.groovy @@ -1012,4 +1012,36 @@ triple3 : """edge cases \\""" "" " \\"" \\" edge cases""" then: argValue.getValue() == "🍺" // contains the beer icon U+1F37 A : http://www.charbase.com/1f37a-unicode-beer-mug } + + def "invalid surrogate pair - no trailing value"() { + given: + def input = ''' + { + foo(arg: "\\ud83c") + } + ''' + + when: + Parser.parse(input) + + then: + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - leading surrogate must be followed by a trailing surrogate - offending token '\\ud83c' at line 3 column 24" + } + + def "invalid surrogate pair - no leading value"() { + given: + def input = ''' + { + foo(arg: "\\uDC00") + } + ''' + + when: + Parser.parse(input) + + then: + InvalidSyntaxException e = thrown(InvalidSyntaxException) + e.message == "Invalid Syntax : Invalid unicode - trailing surrogate must be preceded with a leading surrogate - offending token '\\uDC00' at line 3 column 24" + } } From b60d28ab8963286758547c3fedacb9874814841f Mon Sep 17 00:00:00 2001 From: dondonz <13839920+dondonz@users.noreply.github.com> Date: Wed, 14 Jul 2021 15:54:30 +1000 Subject: [PATCH 15/15] Add full invalid Unicode surrogate test --- src/test/groovy/graphql/GraphQLTest.groovy | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/test/groovy/graphql/GraphQLTest.groovy b/src/test/groovy/graphql/GraphQLTest.groovy index 2e4f7b9a7c..dcf75a764f 100644 --- a/src/test/groovy/graphql/GraphQLTest.groovy +++ b/src/test/groovy/graphql/GraphQLTest.groovy @@ -179,6 +179,31 @@ class GraphQLTest extends Specification { errors[0].locations == [new SourceLocation(1, 8)] } + def "query with invalid Unicode surrogate in argument - no trailing value"() { + given: + GraphQLSchema schema = newSchema().query( + newObject() + .name("RootQueryType") + .field(newFieldDefinition() + .name("field") + .type(GraphQLString) + .argument(newArgument() + .name("arg") + .type(GraphQLNonNull.nonNull(GraphQLString)))) + .build() + ).build() + + when: + // Invalid Unicode character - leading surrogate value without trailing surrogate value + def errors = GraphQL.newGraphQL(schema).build().execute('{ hello(arg:"\\ud83c") }').errors + + then: + errors.size() == 1 + errors[0].errorType == ErrorType.InvalidSyntax + errors[0].message == "Invalid Syntax : Invalid unicode - leading surrogate must be followed by a trailing surrogate - offending token '\\ud83c' at line 1 column 13" + errors[0].locations == [new SourceLocation(1, 13)] + } + def "non null argument is missing"() { given: GraphQLSchema schema = newSchema().query(