Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support full unicode in parser #2404

Merged
merged 16 commits into from Jul 14, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/main/antlr/GraphqlCommon.g4
Expand Up @@ -120,12 +120,12 @@ fragment BlockStringCharacter:
ExtendedSourceCharacter;

fragment StringCharacter:
([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) | // this is SoureCharacter without '"' and '\'
([\u0009\u0020\u0021] | [\u0023-\u005b] | [\u005d-\u{10FFFF}]) | // this is SourceCharacter without '"' and '\'
dondonz marked this conversation as resolved.
Show resolved Hide resolved
'\\u' EscapedUnicode |
'\\' EscapedCharacter;

fragment EscapedCharacter : ["\\/bfnrt];
fragment EscapedUnicode : Hex Hex Hex Hex;
fragment EscapedUnicode : Hex Hex Hex Hex | '{' Hex+ '}';
fragment Hex : [0-9a-fA-F];


Expand Down
13 changes: 7 additions & 6 deletions src/main/java/graphql/parser/StringValueParsing.java
Expand Up @@ -30,7 +30,9 @@ public static String removeIndentation(String rawValue) {
String[] lines = rawValue.split("\\n");
Integer commonIndent = null;
for (int i = 0; i < lines.length; i++) {
if (i == 0) continue;
if (i == 0) {
continue;
}
String line = lines[i];
int length = line.length();
int indent = leadingWhitespace(line);
Expand All @@ -44,7 +46,9 @@ public static String removeIndentation(String rawValue) {
if (commonIndent != null) {
for (int i = 0; i < lineList.size(); i++) {
String line = lineList.get(i);
if (i == 0) continue;
if (i == 0) {
continue;
}
if (line.length() > commonIndent) {
line = line.substring(commonIndent);
lineList.set(i, line);
Expand Down Expand Up @@ -135,10 +139,7 @@ public static String parseSingleQuotedString(String string) {
writer.write('\t');
continue;
case 'u':
String hexStr = string.substring(i + 1, i + 5);
int codepoint = Integer.parseInt(hexStr, 16);
i += 4;
writer.write(codepoint);
i = UnicodeUtil.parseAndWriteUnicode(writer, string, i);
continue;
default:
Assert.assertShouldNeverHappen();
Expand Down
64 changes: 64 additions & 0 deletions src/main/java/graphql/parser/UnicodeUtil.java
@@ -0,0 +1,64 @@
package graphql.parser;

import graphql.Assert;
import graphql.Internal;

import java.io.IOException;
import java.io.StringWriter;

/**
* Contains Unicode helpers for parsing StringValue types in the grammar
*/
@Internal
public class UnicodeUtil {
    /** Largest valid Unicode code point (U+10FFFF). */
    public static final int MAX_UNICODE_CODE_POINT = 0x10FFFF;

    /**
     * Decodes one unicode escape and appends the decoded character(s) to {@code writer}.
     *
     * <p>Two escape forms are understood:
     * <ul>
     *   <li>exactly four hex digits, e.g. {@code \}{@code u597D}; the value is written as a single
     *       UTF-16 code unit, so a surrogate pair written as two consecutive escapes still
     *       combines into one supplementary character in the output;</li>
     *   <li>one or more hex digits in braces, e.g. {@code \}{@code u{1F37A}}, which can express
     *       code points outside the Basic Multilingual Plane.</li>
     * </ul>
     *
     * @param writer destination for the decoded character(s)
     * @param string the text containing the escape
     * @param i      index of the {@code 'u'} that follows the backslash
     * @return index of the last character consumed by the escape (the fourth hex digit, or the
     *         closing brace), so the caller's loop increment moves past the escape
     * @throws RuntimeException with message {@code "invalid unicode encoding"} when the escape is
     *         truncated, unterminated, empty, or contains a non-hex character, and with message
     *         {@code "invalid unicode code point"} when a braced value exceeds
     *         {@link #MAX_UNICODE_CODE_POINT}
     */
    public static int parseAndWriteUnicode(StringWriter writer, String string, int i) {
        if (i + 1 >= string.length()) {
            throw new RuntimeException("invalid unicode encoding");
        }

        // Four hex character only case, for code points in the Basic Multilingual Plane (BMP)
        if (isNotBracedEscape(string, i)) {
            int endIndexExclusive = i + 5;
            if (endIndexExclusive > string.length()) {
                throw new RuntimeException("invalid unicode encoding");
            }
            int codeUnit = parseHex(string, i + 1, endIndexExclusive);
            // StringWriter.write(int) appends exactly one char, which is what a 4-digit escape denotes.
            writer.write(codeUnit);
            return i + 4;
        }

        // Any number of hex characters in braces, which allows code points outside the BMP
        int startIx = i + 2;
        int closingBraceIx = string.indexOf('}', startIx);
        if (closingBraceIx < 0) {
            throw new RuntimeException("invalid unicode encoding");
        }
        int codePoint = parseHex(string, startIx, closingBraceIx);
        if (!isValidUnicodeCodePoint(codePoint)) {
            throw new RuntimeException("invalid unicode code point");
        }
        char[] chars = Character.toChars(codePoint);
        // The (char[], int, int) overload is StringWriter's own and declares no IOException.
        writer.write(chars, 0, chars.length);
        return closingBraceIx;
    }

    /**
     * Parses the ASCII hex digits in {@code string[from, toExclusive)}.
     *
     * <p>Values above {@link #MAX_UNICODE_CODE_POINT} are saturated to
     * {@code MAX_UNICODE_CODE_POINT + 1} so that arbitrarily long digit runs cannot overflow
     * {@code int}; every digit is still validated.
     */
    private static int parseHex(String string, int from, int toExclusive) {
        if (from >= toExclusive) {
            throw new RuntimeException("invalid unicode encoding");
        }
        int value = 0;
        for (int ix = from; ix < toExclusive; ix++) {
            int digit = hexDigit(string.charAt(ix));
            if (digit < 0) {
                throw new RuntimeException("invalid unicode encoding");
            }
            value = (value << 4) | digit;
            if (value > MAX_UNICODE_CODE_POINT) {
                value = MAX_UNICODE_CODE_POINT + 1;
            }
        }
        return value;
    }

    /** Returns the numeric value of an ASCII hex digit, or -1 if {@code c} is not one. */
    private static int hexDigit(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + 10;
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + 10;
        }
        return -1;
    }

    /** True when the character after the {@code 'u'} at index {@code i} is not an opening brace. */
    private static boolean isNotBracedEscape(String string, int i) {
        return string.charAt(i + 1) != '{';
    }

    /** True when {@code value} lies in the Unicode code point range 0..U+10FFFF. */
    private static boolean isValidUnicodeCodePoint(int value) {
        // TODO: Add bad surrogate checks
        return value >= 0 && value <= MAX_UNICODE_CODE_POINT;
    }
}
10 changes: 4 additions & 6 deletions src/test/groovy/graphql/parser/StringValueParsingTest.groovy
Expand Up @@ -40,8 +40,7 @@ class StringValueParsingTest extends Specification {
parsed == '''"'''
}

def "parsing emoji should work"() {
// needs surrogate pairs for this emoji
def "parsing beer stein as surrogate pair should work"() {
given:
def input = '''"\\ud83c\\udf7a"'''

Expand All @@ -52,18 +51,17 @@ class StringValueParsingTest extends Specification {
parsed == '''🍺''' // contains the beer icon U+1F37A : http://www.charbase.com/1f37a-unicode-beer-mug
}

def "parsing simple unicode should work"() {
def "parsing simple unicode should work - Basic Multilingual Plane (BMP)"() {
given:
def input = '''"\\u56fe"'''
def input = '''"\\u5564\\u9152"'''

when:
String parsed = StringValueParsing.parseSingleQuotedString(input)

then:
parsed == '''图'''
parsed == '''啤酒'''
}


def "parsing triple quoted string should work"() {
given:
def input = '''"""triple quoted"""'''
Expand Down