Skip to content

Commit

Permalink
SI-12290: support JDK15 text blocks in Java parser
Browse files Browse the repository at this point in the history
JDK15 introduced text blocks (JEP 378) for writing multiline strings.
This adds support for parsing these strings in the Java parser.

The logic for interpretting the literals is a little complicated, but
follows from the "3.10.6. Text Blocks" of the Java language specification.
The test cases include examples from there and from the JEP.
  • Loading branch information
harpocrates authored and lrytz committed Apr 23, 2021
1 parent d112b8c commit a8225a0
Show file tree
Hide file tree
Showing 7 changed files with 357 additions and 24 deletions.
185 changes: 161 additions & 24 deletions src/compiler/scala/tools/nsc/javac/JavaScanners.scala
Expand Up @@ -239,6 +239,9 @@ trait JavaScanners extends ast.parser.ScannersCommon {
*/
protected def putChar(c: Char): Unit = { cbuf.append(c) }

/** Remove the last N characters from the buffer */
private def popNChars(n: Int): Unit = if (n > 0) cbuf.setLength(cbuf.length - n)

/** Clear buffer and set name */
private def setName(): Unit = {
name = newTermName(cbuf.toString())
Expand Down Expand Up @@ -322,15 +325,26 @@ trait JavaScanners extends ast.parser.ScannersCommon {

case '\"' =>
in.next()
while (in.ch != '\"' && (in.isUnicode || in.ch != CR && in.ch != LF && in.ch != SU)) {
getlitch()
}
if (in.ch == '\"') {
token = STRINGLIT
setName()
in.next()
if (in.ch != '\"') { // "..." non-empty string literal
while (in.ch != '\"' && (in.isUnicode || in.ch != CR && in.ch != LF && in.ch != SU)) {
getlitch()
}
if (in.ch == '\"') {
token = STRINGLIT
setName()
in.next()
} else {
syntaxError("unclosed string literal")
}
} else {
syntaxError("unclosed string literal")
in.next()
if (in.ch != '\"') { // "" empty string literal
token = STRINGLIT
setName()
} else {
in.next()
getTextBlock()
}
}
return

Expand Down Expand Up @@ -664,9 +678,12 @@ trait JavaScanners extends ast.parser.ScannersCommon {
// Literals -----------------------------------------------------------------

/** read next character in character or string literal:
*/
protected def getlitch() =
if (in.ch == '\\') {
*
* @param scanOnly skip emitting errors or adding to the literal buffer
* @param inTextBlock is this for a text block?
*/
protected def getlitch(scanOnly: Boolean = false, inTextBlock: Boolean = false): Unit = {
val c: Char = if (in.ch == '\\') {
in.next()
if ('0' <= in.ch && in.ch <= '7') {
val leadch: Char = in.ch
Expand All @@ -680,27 +697,147 @@ trait JavaScanners extends ast.parser.ScannersCommon {
in.next()
}
}
putChar(oct.asInstanceOf[Char])
oct.asInstanceOf[Char]
} else {
in.ch match {
case 'b' => putChar('\b')
case 't' => putChar('\t')
case 'n' => putChar('\n')
case 'f' => putChar('\f')
case 'r' => putChar('\r')
case '\"' => putChar('\"')
case '\'' => putChar('\'')
case '\\' => putChar('\\')
val c: Char = in.ch match {
case 'b' => '\b'
case 's' => ' '
case 't' => '\t'
case 'n' => '\n'
case 'f' => '\f'
case 'r' => '\r'
case '\"' => '\"'
case '\'' => '\''
case '\\' => '\\'
case CR | LF if inTextBlock =>
in.next()
return
case _ =>
syntaxError(in.cpos - 1, "invalid escape character")
putChar(in.ch)
if (!scanOnly) syntaxError(in.cpos - 1, "invalid escape character")
in.ch
}
in.next()
c
}
} else {
putChar(in.ch)
val c = in.ch
in.next()
c
}
if (!scanOnly) putChar(c)
}

/** read a triple-quote delimited text block, starting after the first three
* double quotes
*/
private def getTextBlock(): Unit = {
// Open delimiter is followed by optional space, then a newline
while (in.ch == ' ' || in.ch == '\t' || in.ch == FF) {
in.next()
}
if (in.ch != LF && in.ch != CR) { // CR-LF is already normalized into LF by `JavaCharArrayReader`
syntaxError("illegal text block open delimiter sequence, missing line terminator")
return
}
in.next()

/* Do a lookahead scan over the full text block to:
* - compute common white space prefix
* - find the offset where the text block ends
*/
var commonWhiteSpacePrefix = Int.MaxValue
var blockEndOffset = 0
val backtrackTo = in.copy
var blockClosed = false
var lineWhiteSpacePrefix = 0
var lineIsOnlyWhitespace = true
while (!blockClosed && (in.isUnicode || in.ch != SU)) {
if (in.ch == '\"') { // Potential end of the block
in.next()
if (in.ch == '\"') {
in.next()
if (in.ch == '\"') {
blockClosed = true
commonWhiteSpacePrefix = commonWhiteSpacePrefix min lineWhiteSpacePrefix
blockEndOffset = in.cpos - 2
}
}

// Not the end of the block - just a single or double " character
if (!blockClosed) {
lineIsOnlyWhitespace = false
}
} else if (in.ch == CR || in.ch == LF) { // new line in the block
in.next()
if (!lineIsOnlyWhitespace) {
commonWhiteSpacePrefix = commonWhiteSpacePrefix min lineWhiteSpacePrefix
}
lineWhiteSpacePrefix = 0
lineIsOnlyWhitespace = true
} else if (lineIsOnlyWhitespace && Character.isWhitespace(in.ch)) { // extend white space prefix
in.next()
lineWhiteSpacePrefix += 1
} else {
lineIsOnlyWhitespace = false
getlitch(scanOnly = true, inTextBlock = true)
}
}

// Bail out if the block never did have an end
if (!blockClosed) {
syntaxError("unclosed text block")
return
}

// Second pass: construct the literal string value this time
in = backtrackTo
while (in.cpos < blockEndOffset) {
// Drop the line's leading whitespace
var remainingPrefix = commonWhiteSpacePrefix
while (remainingPrefix > 0 && in.ch != CR && in.ch != LF && in.cpos < blockEndOffset) {
in.next()
remainingPrefix -= 1
}

var trailingWhitespaceLength = 0
var escapedNewline = false // Does the line end with `\`?
while (in.ch != CR && in.ch != LF && in.cpos < blockEndOffset && !escapedNewline) {
if (Character.isWhitespace(in.ch)) {
trailingWhitespaceLength += 1
} else {
trailingWhitespaceLength = 0
}

// Detect if the line is about to end with `\`
if (in.ch == '\\' && {
val lookahead = in.copy
lookahead.next()
lookahead.ch == CR || lookahead.ch == LF
}) {
escapedNewline = true
}

getlitch(scanOnly = false, inTextBlock = true)
}

// Drop the line's trailing whitespace
popNChars(trailingWhitespaceLength)

// Normalize line terminators
if ((in.ch == CR || in.ch == LF) && !escapedNewline) {
in.next()
putChar('\n')
}
}

token = STRINGLIT
setName()

// Trailing """
in.next()
in.next()
in.next()
}

/** read fractional part and exponent of floating point number
* if one is present.
Expand Down
13 changes: 13 additions & 0 deletions test/files/neg/text-blocks.check
@@ -0,0 +1,13 @@
text-blocks/Invalid1.java:4: error: illegal text block open delimiter sequence, missing line terminator
public static final String badOpeningDelimiter = """non-whitespace
^
text-blocks/Invalid1.java:4: error: <identifier> expected
public static final String badOpeningDelimiter = """non-whitespace
^
text-blocks/Invalid1.java:6: error: illegal text block open delimiter sequence, missing line terminator
""";
^
text-blocks/Invalid2.java:6: error: unclosed string literal
foo"""";
^
4 errors
7 changes: 7 additions & 0 deletions test/files/neg/text-blocks/Invalid1.java
@@ -0,0 +1,7 @@
// javaVersion: 15+
class Invalid1 {

public static final String badOpeningDelimiter = """non-whitespace
foo
""";
}
7 changes: 7 additions & 0 deletions test/files/neg/text-blocks/Invalid2.java
@@ -0,0 +1,7 @@
// javaVersion: 15+
class Invalid2 {

// Closing delimiter is first three eligible `"""`, not last
public static final String closingDelimiterIsNotScalas = """
foo"""";
}
61 changes: 61 additions & 0 deletions test/files/run/t12290.check
@@ -0,0 +1,61 @@
====
A text

====
<html>
<body>
<p>Hello, world</p>
</body>
</html>

====
SELECT "EMP_ID", "LAST_NAME" FROM "EMPLOYEE_TB"
WHERE "CITY" = 'INDIANAPOLIS'
ORDER BY "EMP_ID", "LAST_NAME";

====
<html>
<body>
<p>Hello, world</p>
</body>
</html>

====
<html>
<body>
<p>Hello, world</p>
</body>
</html>

====
<html>
<body>
<p>Hello, world</p>
</body>

</html>

====
<html>

<body> <p>Hello , world</p>
</body>
</html>

====
this line has 4 tabs before it
this line has 5 spaces before it and space after it
this line has 2 tabs and 3 spaces before it
 this line has 6 spaces before it

====
String text = """
A text block inside a text block
""";

====
foo bar
baz
====

====
30 changes: 30 additions & 0 deletions test/files/run/t12290/Test.scala
@@ -0,0 +1,30 @@
// javaVersion: 15+
/* Using `valueOf` is a way to check that the Java string literals were properly
* parsed, since the parsed value is what the Scala compiler will use when
* resolving the singleton types
*/
object Test extends App {
println("====")
println(valueOf[TextBlocks.aText.type])
println("====")
println(valueOf[TextBlocks.html1.type])
println("====")
println(valueOf[TextBlocks.query.type])
println("====")
println(valueOf[TextBlocks.html2.type])
println("====")
println(valueOf[TextBlocks.html3.type])
println("====")
println(valueOf[TextBlocks.html4.type])
println("====")
println(valueOf[TextBlocks.html5.type])
println("====")
println(valueOf[TextBlocks.mixedIndents.type])
println("====")
println(valueOf[TextBlocks.code.type])
println("====")
println(valueOf[TextBlocks.simpleString.type])
println("====")
println(valueOf[TextBlocks.emptyString.type])
println("====")
}

0 comments on commit a8225a0

Please sign in to comment.