Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2404 from dondonz/unicode-full-range
Support full unicode in parser
- Loading branch information
Showing
9 changed files
with
504 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
package graphql.parser; | ||
|
||
import graphql.Internal; | ||
import graphql.language.SourceLocation; | ||
|
||
import java.io.IOException; | ||
import java.io.StringWriter; | ||
|
||
import static graphql.Assert.assertShouldNeverHappen; | ||
|
||
/** | ||
* Contains Unicode helpers for parsing StringValue types in the grammar | ||
*/ | ||
@Internal | ||
public class UnicodeUtil { | ||
public static int MAX_UNICODE_CODE_POINT = 0x10FFFF; | ||
public static int LEADING_SURROGATE_LOWER_BOUND = 0xD800; | ||
public static int LEADING_SURROGATE_UPPER_BOUND = 0xDBFF; | ||
public static int TRAILING_SURROGATE_LOWER_BOUND = 0xDC00; | ||
public static int TRAILING_SURROGATE_UPPER_BOUND = 0xDFFF; | ||
|
||
public static int parseAndWriteUnicode(StringWriter writer, String string, int i, SourceLocation sourceLocation) { | ||
// Unicode code points can either be: | ||
// 1. Unbraced: four hex characters in the form \\u597D, or | ||
// 2. Braced: any number of hex characters surrounded by braces in the form \\u{1F37A} | ||
|
||
// Extract the code point hex digits. Index i points to 'u' | ||
int startIndex = isBracedEscape(string, i) ? i + 2 : i + 1; | ||
int endIndexExclusive = getEndIndexExclusive(string, i, sourceLocation); | ||
// Index for parser to continue at, the last character of the escaped unicode character. Either } or hex digit | ||
int continueIndex = isBracedEscape(string, i) ? endIndexExclusive : endIndexExclusive - 1; | ||
|
||
String hexStr = string.substring(startIndex, endIndexExclusive); | ||
Integer codePoint = Integer.parseInt(hexStr, 16); | ||
|
||
if (isTrailingSurrogateValue(codePoint)) { | ||
throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - trailing surrogate must be preceded with a leading surrogate -", null, string.substring(i - 1, continueIndex + 1), null); | ||
} else if (isLeadingSurrogateValue(codePoint)) { | ||
if (!isEscapedUnicode(string, continueIndex + 1)) { | ||
throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, string.substring(i - 1, continueIndex + 1), null); | ||
} | ||
|
||
// Shift parser ahead to 'u' in second escaped Unicode character | ||
i = continueIndex + 2; | ||
int trailingStartIndex = isBracedEscape(string, i) ? i + 2 : i + 1; | ||
int trailingEndIndexExclusive = getEndIndexExclusive(string, i, sourceLocation); | ||
String trailingHexStr = string.substring(trailingStartIndex, trailingEndIndexExclusive); | ||
Integer trailingCodePoint = Integer.parseInt(trailingHexStr, 16); | ||
continueIndex = isBracedEscape(string, i) ? trailingEndIndexExclusive : trailingEndIndexExclusive - 1; | ||
|
||
if (isTrailingSurrogateValue(trailingCodePoint)) { | ||
writeCodePoint(writer, codePoint); | ||
writeCodePoint(writer, trailingCodePoint); | ||
return continueIndex; | ||
} | ||
|
||
throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - leading surrogate must be followed by a trailing surrogate -", null, string.substring(i - 1, continueIndex + 1), null); | ||
} else if (isValidUnicodeCodePoint(codePoint)) { | ||
writeCodePoint(writer, codePoint); | ||
return continueIndex; | ||
} | ||
|
||
throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - not a valid code point -", null, string.substring(i - 1, continueIndex + 1), null); | ||
} | ||
|
||
private static int getEndIndexExclusive(String string, int i, SourceLocation sourceLocation) { | ||
// Unbraced case, with exactly 4 hex digits | ||
if (string.length() > i + 5 && !isBracedEscape(string, i)) { | ||
return i + 5; | ||
} | ||
|
||
// Braced case, with any number of hex digits | ||
int endIndexExclusive = i + 2; | ||
do { | ||
if (endIndexExclusive + 1 >= string.length()) { | ||
throw new InvalidSyntaxException(sourceLocation, "Invalid unicode - incorrectly formatted escape -", null, string.substring(i - 1, endIndexExclusive), null); | ||
} | ||
} while (string.charAt(++endIndexExclusive) != '}'); | ||
|
||
return endIndexExclusive; | ||
} | ||
|
||
private static boolean isValidUnicodeCodePoint(int value) { | ||
return value <= MAX_UNICODE_CODE_POINT; | ||
} | ||
|
||
private static boolean isEscapedUnicode(String string, int index) { | ||
if (index + 1 >= string.length()) { | ||
return false; | ||
} | ||
return string.charAt(index) == '\\' && string.charAt(index + 1) == 'u'; | ||
} | ||
|
||
private static boolean isLeadingSurrogateValue(int value) { | ||
return LEADING_SURROGATE_LOWER_BOUND <= value && value <= LEADING_SURROGATE_UPPER_BOUND; | ||
} | ||
|
||
private static boolean isTrailingSurrogateValue(int value) { | ||
return TRAILING_SURROGATE_LOWER_BOUND <= value && value <= TRAILING_SURROGATE_UPPER_BOUND; | ||
} | ||
|
||
private static void writeCodePoint(StringWriter writer, int codepoint) { | ||
char[] chars = Character.toChars(codepoint); | ||
try { | ||
writer.write(chars); | ||
} catch (IOException e) { | ||
assertShouldNeverHappen(); | ||
} | ||
} | ||
|
||
private static boolean isBracedEscape(String string, int i) { | ||
return string.charAt(i + 1) == '{'; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.