Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support full unicode in parser #2404

Merged
merged 16 commits into from Jul 14, 2021
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/main/antlr/GraphqlCommon.g4
Expand Up @@ -120,12 +120,12 @@ fragment BlockStringCharacter:
ExtendedSourceCharacter;

fragment StringCharacter:
([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) | // this is SoureCharacter without '"' and '\'
([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) | // this is SourceCharacter without '"' and '\'
dondonz marked this conversation as resolved.
Show resolved Hide resolved
'\\u' EscapedUnicode |
'\\' EscapedCharacter;

fragment EscapedCharacter : ["\\/bfnrt];
fragment EscapedUnicode : Hex Hex Hex Hex;
fragment EscapedUnicode : Hex Hex Hex Hex | '{' Hex+ '}';
fragment Hex : [0-9a-fA-F];


Expand Down
13 changes: 7 additions & 6 deletions src/main/java/graphql/parser/StringValueParsing.java
Expand Up @@ -30,7 +30,9 @@ public static String removeIndentation(String rawValue) {
String[] lines = rawValue.split("\\n");
Integer commonIndent = null;
for (int i = 0; i < lines.length; i++) {
if (i == 0) continue;
if (i == 0) {
continue;
}
String line = lines[i];
int length = line.length();
int indent = leadingWhitespace(line);
Expand All @@ -44,7 +46,9 @@ public static String removeIndentation(String rawValue) {
if (commonIndent != null) {
for (int i = 0; i < lineList.size(); i++) {
String line = lineList.get(i);
if (i == 0) continue;
if (i == 0) {
continue;
}
if (line.length() > commonIndent) {
line = line.substring(commonIndent);
lineList.set(i, line);
Expand Down Expand Up @@ -135,10 +139,7 @@ public static String parseSingleQuotedString(String string) {
writer.write('\t');
continue;
case 'u':
String hexStr = string.substring(i + 1, i + 5);
int codepoint = Integer.parseInt(hexStr, 16);
i += 4;
writer.write(codepoint);
i = UnicodeUtil.parseAndWriteUnicode(writer, string, i);
continue;
default:
Assert.assertShouldNeverHappen();
Expand Down
113 changes: 113 additions & 0 deletions src/main/java/graphql/parser/UnicodeUtil.java
@@ -0,0 +1,113 @@
package graphql.parser;

import graphql.Internal;

import java.io.IOException;
import java.io.StringWriter;

import static graphql.Assert.assertShouldNeverHappen;

/**
 * Contains Unicode helpers for parsing StringValue types in the grammar.
 * <p>
 * Two escape forms are handled, both introduced by a backslash followed by {@code u}:
 * <ul>
 *     <li>Unbraced: exactly four hex digits, e.g. <code>\</code><code>u597D</code></li>
 *     <li>Braced: one or more hex digits surrounded by braces, e.g. <code>\</code><code>u{1F37A}</code></li>
 * </ul>
 * Every malformed escape is reported as an {@link InvalidSyntaxException}; no other
 * runtime exception (such as {@link NumberFormatException}) is allowed to escape.
 */
@Internal
public class UnicodeUtil {
    public static final int MAX_UNICODE_CODE_POINT = 0x10FFFF;
    public static final int LEADING_SURROGATE_LOWER_BOUND = 0xD800;
    public static final int LEADING_SURROGATE_UPPER_BOUND = 0xDBFF;
    public static final int TRAILING_SURROGATE_LOWER_BOUND = 0xDC00;
    public static final int TRAILING_SURROGATE_UPPER_BOUND = 0xDFFF;

    /**
     * Parses the escaped unicode character starting at index {@code i} and writes the decoded
     * character(s) to {@code writer}.
     * <p>
     * A leading surrogate must be immediately followed by a second escape holding a trailing
     * surrogate; both halves are then written. A lone trailing surrogate is rejected.
     *
     * @param writer the writer receiving the decoded characters
     * @param string the string being parsed
     * @param i      the index of the {@code u} of the escape (the preceding character is the backslash)
     *
     * @return the index of the last character consumed (a hex digit, or the closing brace),
     * which is where the caller continues parsing from
     *
     * @throws InvalidSyntaxException if the escape is malformed, is not a valid code point,
     *                                or is an unpaired surrogate
     */
    public static int parseAndWriteUnicode(StringWriter writer, String string, int i) {
        int continueIndex = getContinueIndex(string, i);
        int codePoint = parseCodePoint(string, i, continueIndex);

        if (isTrailingSurrogateValue(codePoint)) {
            throw new InvalidSyntaxException(null, "Invalid unicode - trailing surrogate must be preceded with a leading surrogate -", null, offendingToken(string, i, continueIndex + 1), null);
        }

        if (isLeadingSurrogateValue(codePoint)) {
            if (!isEscapedUnicode(string, continueIndex + 1)) {
                throw new InvalidSyntaxException(null, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, offendingToken(string, i, continueIndex + 1), null);
            }

            // Shift ahead to the 'u' of the second escaped Unicode character
            int trailingIndex = continueIndex + 2;
            int trailingContinueIndex = getContinueIndex(string, trailingIndex);
            int trailingCodePoint = parseCodePoint(string, trailingIndex, trailingContinueIndex);

            if (!isTrailingSurrogateValue(trailingCodePoint)) {
                throw new InvalidSyntaxException(null, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, offendingToken(string, trailingIndex, trailingContinueIndex + 1), null);
            }

            writeCodePoint(writer, codePoint);
            writeCodePoint(writer, trailingCodePoint);
            return trailingContinueIndex;
        }

        if (!isValidUnicodeCodePoint(codePoint)) {
            throw new InvalidSyntaxException(null, "Invalid unicode - not a valid code point -", null, offendingToken(string, i, continueIndex + 1), null);
        }

        writeCodePoint(writer, codePoint);
        return continueIndex;
    }

    /**
     * Returns the index of the last character of the escape whose {@code u} is at index {@code i}:
     * the closing brace for a braced escape, otherwise the fourth hex digit.
     */
    private static int getContinueIndex(String string, int i) {
        int endIndexExclusive = getEndIndexExclusive(string, i);
        return isBracedEscape(string, i) ? endIndexExclusive : endIndexExclusive - 1;
    }

    /**
     * Extracts and parses the hex digits of the escape whose {@code u} is at index {@code i}.
     *
     * @throws InvalidSyntaxException if the digits are missing, are not hex digits, or do not fit in an int
     */
    private static int parseCodePoint(String string, int i, int continueIndex) {
        boolean braced = isBracedEscape(string, i);
        int startIndex = braced ? i + 2 : i + 1;
        int endIndexExclusive = braced ? continueIndex : continueIndex + 1;
        String hexStr = string.substring(startIndex, endIndexExclusive);

        // Integer.parseInt also accepts a leading sign, so validate the digits explicitly
        if (hexStr.isEmpty() || !isHexDigits(hexStr)) {
            throw new InvalidSyntaxException(null, "Invalid unicode - incorrectly formatted escape -", null, offendingToken(string, i, continueIndex + 1), null);
        }

        try {
            return Integer.parseInt(hexStr, 16);
        } catch (NumberFormatException e) {
            // Only reachable when the value overflows an int, e.g. a braced escape with too many digits
            throw new InvalidSyntaxException(null, "Invalid unicode - not a valid code point -", null, offendingToken(string, i, continueIndex + 1), e);
        }
    }

    /**
     * Returns the exclusive end index of the hex digits of the escape whose {@code u} is at index {@code i}.
     * For a braced escape this is the index of the closing brace.
     *
     * @throws InvalidSyntaxException if an unbraced escape is truncated or a braced escape is never closed
     */
    private static int getEndIndexExclusive(String string, int i) {
        if (!isBracedEscape(string, i)) {
            // Unbraced case, with exactly 4 hex digits
            if (string.length() < i + 5) {
                throw new InvalidSyntaxException(null, "Invalid unicode - incorrectly formatted escape -", null, offendingToken(string, i, Math.max(i + 2, string.length() - 1)), null);
            }
            return i + 5;
        }

        // Braced case, with any number of hex digits: the first '}' after the '{' terminates the escape
        int closingBrace = string.indexOf('}', i + 2);
        if (closingBrace < 0) {
            throw new InvalidSyntaxException(null, "Invalid unicode - incorrectly formatted escape -", null, offendingToken(string, i, Math.max(i + 2, string.length() - 1)), null);
        }
        return closingBrace;
    }

    /**
     * Returns the offending text for an error message, starting at the backslash before index {@code i}.
     * Both bounds are clamped to the string so building an error message can never itself fail.
     */
    private static String offendingToken(String string, int i, int endIndexExclusive) {
        int from = Math.min(Math.max(0, i - 1), string.length());
        int to = Math.max(from, Math.min(string.length(), endIndexExclusive));
        return string.substring(from, to);
    }

    private static boolean isHexDigits(String value) {
        for (int index = 0; index < value.length(); index++) {
            char c = value.charAt(index);
            boolean hex = ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
            if (!hex) {
                return false;
            }
        }
        return true;
    }

    private static boolean isValidUnicodeCodePoint(int value) {
        // Bounded on both sides so a negative value is rejected rather than passed to Character.toChars
        return 0 <= value && value <= MAX_UNICODE_CODE_POINT;
    }

    private static boolean isEscapedUnicode(String string, int index) {
        if (index < 0 || index + 1 >= string.length()) {
            return false;
        }
        return string.charAt(index) == '\\' && string.charAt(index + 1) == 'u';
    }

    private static boolean isLeadingSurrogateValue(int value) {
        return LEADING_SURROGATE_LOWER_BOUND <= value && value <= LEADING_SURROGATE_UPPER_BOUND;
    }

    private static boolean isTrailingSurrogateValue(int value) {
        return TRAILING_SURROGATE_LOWER_BOUND <= value && value <= TRAILING_SURROGATE_UPPER_BOUND;
    }

    private static void writeCodePoint(StringWriter writer, int codepoint) {
        char[] chars = Character.toChars(codepoint);
        // StringWriter's three-argument write declares no IOException, so no catch block is needed
        writer.write(chars, 0, chars.length);
    }

    private static boolean isBracedEscape(String string, int i) {
        // Bounds-checked so an escape at the very end of the string is treated as unbraced (and then rejected)
        return i + 1 < string.length() && string.charAt(i + 1) == '{';
    }
}
10 changes: 4 additions & 6 deletions src/test/groovy/graphql/parser/StringValueParsingTest.groovy
Expand Up @@ -40,8 +40,7 @@ class StringValueParsingTest extends Specification {
parsed == '''"'''
}

def "parsing emoji should work"() {
// needs surrogate pairs for this emoji
def "parsing beer stein as surrogate pair should work"() {
given:
def input = '''"\\ud83c\\udf7a"'''

Expand All @@ -52,18 +51,17 @@ class StringValueParsingTest extends Specification {
parsed == '''🍺''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug
}

def "parsing simple unicode should work"() {
def "parsing simple unicode should work - Basic Multilingual Plane (BMP)"() {
given:
def input = '''"\\u56fe"'''
def input = '''"\\u5564\\u9152"'''

when:
String parsed = StringValueParsing.parseSingleQuotedString(input)

then:
parsed == ''''''
parsed == '''啤酒'''
}


def "parsing triple quoted string should work"() {
given:
def input = '''"""triple quoted"""'''
Expand Down