Skip to content

Commit

Permalink
SI-12290: support JDK15 text blocks in Java parser
Browse files Browse the repository at this point in the history
JDK15 introduced text blocks (JEP 378) for writing multiline strings.
This adds support for parsing these strings in the Java parser.

The logic for interpreting the literals is a little complicated, but
follows from section "3.10.6. Text Blocks" of the Java language specification.
The test cases include examples from there and from the JEP.

Fixes scala/bug#12290
  • Loading branch information
harpocrates committed Mar 24, 2021
1 parent 8a2cf63 commit af3e6fd
Show file tree
Hide file tree
Showing 4 changed files with 275 additions and 8 deletions.
132 changes: 124 additions & 8 deletions src/compiler/scala/tools/nsc/javac/JavaScanners.scala
Expand Up @@ -239,6 +239,9 @@ trait JavaScanners extends ast.parser.ScannersCommon {
*/
protected def putChar(c: Char): Unit = {
  // Append to the literal buffer; the builder returned by `append` is discarded.
  cbuf.append(c)
  ()
}

/** Drop the trailing `n` characters of the literal buffer.
 *  A non-positive `n` leaves the buffer untouched.
 */
private def popNChars(n: Int): Unit = {
  if (n >= 1) {
    val keptLength = cbuf.length - n
    cbuf.setLength(keptLength)
  }
}

/** Clear buffer and set name */
private def setName(): Unit = {
name = newTermName(cbuf.toString())
Expand Down Expand Up @@ -322,15 +325,26 @@ trait JavaScanners extends ast.parser.ScannersCommon {

case '\"' =>
in.next()
while (in.ch != '\"' && (in.isUnicode || in.ch != CR && in.ch != LF && in.ch != SU)) {
getlitch()
}
if (in.ch == '\"') {
token = STRINGLIT
setName()
in.next()
if (in.ch != '\"') { // "..." non-empty string literal
while (in.ch != '\"' && (in.isUnicode || in.ch != CR && in.ch != LF && in.ch != SU)) {
getlitch()
}
if (in.ch == '\"') {
token = STRINGLIT
setName()
in.next()
} else {
syntaxError("unclosed string literal")
}
} else {
syntaxError("unclosed string literal")
in.next()
if (in.ch != '\"') { // "" empty string literal
token = STRINGLIT
setName()
} else {
in.next()
getTextBlock()
}
}
return

Expand Down Expand Up @@ -702,6 +716,108 @@ trait JavaScanners extends ast.parser.ScannersCommon {
in.next()
}

/** Read a triple-quote delimited text block, starting after the first three
 *  double quotes.
 *
 *  The block is scanned twice from the same starting position:
 *
 *   1. a lookahead pass that finds where the block is closed and computes the
 *      smallest leading-whitespace count over the lines that contain something
 *      other than whitespace (the line holding the closing delimiter always
 *      takes part in that minimum);
 *   2. a construction pass that, for every line, skips that common prefix,
 *      drops trailing whitespace and emits a single `'\n'` per line terminator.
 *
 *  On success `token` is set to `STRINGLIT`, `setName()` publishes the buffer
 *  contents, and the reader is advanced past the closing delimiter. A syntax
 *  error is reported (and the method returns early) when the opening delimiter
 *  is not followed by a line terminator or when the block is never closed.
 */
private def getTextBlock(): Unit = {
  // Open delimiter is followed by optional space, then a newline.
  // NOTE: these must be comparisons (`==`), not an assignment to `in.ch`.
  while (in.ch == ' ' || in.ch == '\t' || in.ch == FF) {
    in.next()
  }
  if (in.ch != LF && in.ch != CR) { // CR-LF is already normalized into LF by `JavaCharArrayReader`
    syntaxError("illegal text block open delimiter sequence, missing line terminator")
    return
  }
  in.next()

  /* Do a lookahead scan over the full text block to:
   *   - compute common white space prefix
   *   - find the offset where the text block ends
   */
  var commonWhiteSpacePrefix = Int.MaxValue
  var blockEndOffset = 0
  val backtrackTo = in.copy // snapshot of the reader, restored for the second pass
  var blockClosed = false
  var lineWhiteSpacePrefix = 0
  var lineIsOnlyWhitespace = true
  while (!blockClosed && (in.isUnicode || in.ch != SU)) {
    if (in.ch == '\"') { // Potential end of the block
      in.next()
      if (in.ch == '\"') {
        in.next()
        if (in.ch == '\"') {
          blockClosed = true
          // The closing delimiter's line contributes to the prefix even
          // though nothing but whitespace precedes the delimiter on it.
          commonWhiteSpacePrefix = commonWhiteSpacePrefix min lineWhiteSpacePrefix
          // Position at which the second pass stops; the three `in.next()`
          // calls at the end of this method then skip the closing delimiter.
          blockEndOffset = in.cpos - 2
        }
      }

      // Not the end of the block - just a single or double " character
      if (!blockClosed) {
        lineIsOnlyWhitespace = false
      }
    } else if (in.ch == CR || in.ch == LF) { // new line in the block
      in.next()
      // Whitespace-only lines do not take part in the common prefix.
      if (!lineIsOnlyWhitespace) {
        commonWhiteSpacePrefix = commonWhiteSpacePrefix min lineWhiteSpacePrefix
      }
      lineWhiteSpacePrefix = 0
      lineIsOnlyWhitespace = true
    } else if (lineIsOnlyWhitespace && isWhitespace(in.ch)) { // extend white space prefix
      in.next()
      lineWhiteSpacePrefix += 1
    } else {
      lineIsOnlyWhitespace = false
      getlitch()
    }
  }
  setName() // clear the literal buffer

  // Bail out if the block never did have an end
  if (!blockClosed) {
    syntaxError("unclosed text block")
    return
  }

  // Second pass: construct the literal string value this time
  in = backtrackTo
  while (in.cpos < blockEndOffset) {
    // Drop the line's leading whitespace (at most the common prefix, and
    // never past the end of the line or of the block).
    var remainingPrefix = commonWhiteSpacePrefix
    while (remainingPrefix > 0 && in.ch != CR && in.ch != LF && in.cpos < blockEndOffset) {
      in.next()
      remainingPrefix -= 1
    }

    // Copy the rest of the line, tracking how many of the most recently
    // read characters were raw whitespace so they can be removed afterwards.
    var trailingWhitespaceLength = 0
    while (in.ch != CR && in.ch != LF && in.cpos < blockEndOffset) {
      if (isWhitespace(in.ch)) {
        trailingWhitespaceLength += 1
      } else {
        trailingWhitespaceLength = 0
      }
      getlitch()
    }

    // Drop the line's trailing whitespace
    popNChars(trailingWhitespaceLength)

    // Normalize line terminators
    if (in.ch == CR || in.ch == LF) {
      in.next()
      putChar('\n')
    }
  }

  token = STRINGLIT
  setName()

  // Trailing """
  in.next()
  in.next()
  in.next()
}

/** read fractional part and exponent of floating point number
* if one is present.
*/
Expand Down
56 changes: 56 additions & 0 deletions test/files/run/t12290.check
@@ -0,0 +1,56 @@
====
A text

====
<html>
<body>
<p>Hello, world</p>
</body>
</html>

====
SELECT "EMP_ID", "LAST_NAME" FROM "EMPLOYEE_TB"
WHERE "CITY" = 'INDIANAPOLIS'
ORDER BY "EMP_ID", "LAST_NAME";

====
<html>
<body>
<p>Hello, world</p>
</body>
</html>

====
<html>
<body>
<p>Hello, world</p>
</body>
</html>

====
<html>
<body>
<p>Hello, world</p>
</body>

</html>

====
<html>

<body>
<p>Hello, world</p>
</body>
</html>

====
String text = """
A text block inside a text block
""";

====
foo bar
baz
====

====
27 changes: 27 additions & 0 deletions test/files/run/t12290/Test.scala
@@ -0,0 +1,27 @@
/* Each Java constant is read back through `valueOf` on its singleton type.
 * The printed text is therefore the value the Scala compiler obtained when it
 * parsed the Java string literal, which is what this test checks.
 */
object Test extends App {
  private val separator = "===="

  // Listed in the order in which the values are printed.
  private val parsedLiterals: List[String] = List(
    valueOf[TextBlocks.aText.type],
    valueOf[TextBlocks.html1.type],
    valueOf[TextBlocks.query.type],
    valueOf[TextBlocks.html2.type],
    valueOf[TextBlocks.html3.type],
    valueOf[TextBlocks.html4.type],
    valueOf[TextBlocks.html5.type],
    valueOf[TextBlocks.code.type],
    valueOf[TextBlocks.simpleString.type],
    valueOf[TextBlocks.emptyString.type]
  )

  // A separator precedes every value and one more closes the output.
  parsedLiterals.foreach { literal =>
    println(separator)
    println(literal)
  }
  println(separator)
}
68 changes: 68 additions & 0 deletions test/files/run/t12290/TextBlocks.java
@@ -0,0 +1,68 @@
/**
 * Fixture for the t12290 test: string constants written as text blocks, plus
 * two ordinary string literals. Test.scala prints the parsed value of each
 * constant, so the literals below are test data and must not be reformatted.
 */
class TextBlocks {

// minimal single-line text block
final static String aText = """
A text
""";

// multi-line text block
final static String html1 = """
<html>
<body>
<p>Hello, world</p>
</body>
</html>
""";

// double-quote characters may appear unescaped inside a text block
final static String query = """
SELECT "EMP_ID", "LAST_NAME" FROM "EMPLOYEE_TB"
WHERE "CITY" = 'INDIANAPOLIS'
ORDER BY "EMP_ID", "LAST_NAME";
""";

// incidental trailing spaces on a line are stripped from the value
final static String html2 = """
<html>
<body>
<p>Hello, world</p>
</body>
</html>
""";

// the position of the closing delimiter influences how much indentation is stripped
final static String html3 = """
<html>
<body>
<p>Hello, world</p>
</body>
</html>
""";

// a blank line does not affect the amount of indentation that is stripped
final static String html4 = """
<html>
<body>
<p>Hello, world</p>
</body>
</html>
""";

// escape sequences (\n, \t) are interpreted inside a text block
final static String html5 = """
<html>\n
<body>
<p>Hello,\tworld</p>
</body>
</html>
""";
// an escaped quote (\""") allows a text block delimiter inside a text block
final static String code =
"""
String text = \"""
A text block inside a text block
\""";
""";

// ordinary (non text block) string literal with escape sequences
final static String simpleString = "foo\tbar\nbaz";

// empty string literal: two quotes that do not open a text block
final static String emptyString = "";
}

0 comments on commit af3e6fd

Please sign in to comment.