Skip to content

Commit

Permalink
Accept supplementary characters in identifiers
Browse files Browse the repository at this point in the history
  • Loading branch information
som-snytt committed Jul 8, 2021
1 parent ccd9576 commit 6c1bb5b
Show file tree
Hide file tree
Showing 16 changed files with 241 additions and 96 deletions.
1 change: 1 addition & 0 deletions build.sbt
Expand Up @@ -717,6 +717,7 @@ lazy val junit = project.in(file("test") / "junit")
"-feature",
"-Xlint:-valpattern,_",
"-Wconf:msg=match may not be exhaustive:s", // if we missed a case, all that happens is the test fails
"-Wconf:cat=lint-nullary-unit&site=.*Test:s", // normal unit test style
"-Ypatmat-exhaust-depth", "40", // despite not caring about patmat exhaustiveness, we still get warnings for this
),
Compile / javacOptions ++= Seq("-Xlint"),
Expand Down
2 changes: 1 addition & 1 deletion src/compiler/scala/tools/nsc/ast/parser/Parsers.scala
Expand Up @@ -264,7 +264,7 @@ self =>
if (syntaxErrors.isEmpty) firstTry
else in.healBraces() match {
case Nil => showSyntaxErrors() ; firstTry
case patches => (this withPatches patches).parse()
case patches => withPatches(patches).parse()
}
}
}
Expand Down
175 changes: 108 additions & 67 deletions src/compiler/scala/tools/nsc/ast/parser/Scanners.scala
Expand Up @@ -172,7 +172,34 @@ trait Scanners extends ScannersCommon {
/** A switch whether operators at the start of lines can be infix operators. */
private var allowLeadingInfixOperators = true

private def isDigit(c: Char) = java.lang.Character isDigit c
private def isDigit(c: Char) = Character.isDigit(c)

import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierPart, isUnicodeIdentifierStart, isValidCodePoint, toCodePoint}

/** Tests whether `high` starts a supplementary character accepted by `test`,
 *  consuming input and filling the character buffer as a side effect.
 *
 *  If `high` is not a high surrogate, nothing is consumed and the result is false.
 *  Otherwise the scanner advances one char and inspects it:
 *   - if it is a low surrogate, the scanner advances again; when the combined
 *     code point is valid and satisfies `test`, both chars are put to the buffer
 *     and the result is true, otherwise a syntax error is reported;
 *   - if it is not a low surrogate and `strict` is false, the lone high surrogate
 *     is put to the buffer and the result is true;
 *   - if it is not a low surrogate and `strict` is true, a syntax error is reported.
 *
 *  @param high   candidate high surrogate (callers pass the current char)
 *  @param test   predicate applied to the combined code point
 *  @param strict require a following low surrogate (relaxed when not in an identifier,
 *                e.g. inside a literal)
 *  @return true exactly when chars were put to the buffer
 */
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
  isHighSurrogate(high) && {
    // The high surrogate is consumed before we know whether a pair follows.
    nextChar()
    val low = ch
    if (isLowSurrogate(low)) {
      nextChar()
      val codepoint = toCodePoint(high, low)
      val accepted = isValidCodePoint(codepoint) && test(codepoint)
      if (accepted) {
        putChar(high)
        putChar(low)
      } else {
        syntaxError(f"illegal character '\\u$high%04x\\u$low%04x'")
      }
      accepted
    } else if (strict) {
      syntaxError(f"illegal character '\\u$high%04x' missing low surrogate")
      false
    } else {
      // Lenient mode: keep the orphan high surrogate as an ordinary char.
      putChar(high)
      true
    }
  }

private var openComments = 0
/** Hands the current character to `processCommentChar()` and then advances with `nextChar()`. */
final protected def putCommentChar(): Unit = {
  processCommentChar()
  nextChar()
}
Expand Down Expand Up @@ -705,14 +732,18 @@ trait Scanners extends ScannersCommon {
syntaxError("empty character literal (use '\\'' for single quote)")
else {
nextChar()
token = CHARLIT
setStrVal()
if (cbuf.length != 1)
syntaxError("illegal codepoint in Char constant: " + cbuf.toString.map(c => f"\\u$c%04x").mkString("'", "", "'"))
else {
token = CHARLIT
setStrVal()
}
}
} else if (isEmptyCharLit) {
}
else if (isEmptyCharLit)
syntaxError("empty character literal")
} else {
else
unclosedCharLit()
}
}
else unclosedCharLit()
}
Expand Down Expand Up @@ -755,16 +786,18 @@ trait Scanners extends ScannersCommon {
} else if (ch == '\u2190') {
deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", "2.13.0")
nextChar(); token = LARROW
} else if (Character.isUnicodeIdentifierStart(ch)) {
} else if (isUnicodeIdentifierStart(ch)) {
putChar(ch)
nextChar()
getIdentRest()
} else if (isSpecial(ch)) {
putChar(ch)
nextChar()
getOperatorRest()
} else if (isSupplementary(ch, isUnicodeIdentifierStart)) {
getIdentRest()
} else {
syntaxError("illegal character '" + ("" + '\\' + 'u' + "%04x".format(ch.toInt)) + "'")
syntaxError(f"illegal character '\\u$ch%04x'")
nextChar()
}
}
Expand Down Expand Up @@ -831,13 +864,15 @@ trait Scanners extends ScannersCommon {
case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
finishNamed()
case _ =>
if (Character.isUnicodeIdentifierPart(ch)) {
if (isUnicodeIdentifierPart(ch)) {
putChar(ch)
nextChar()
getIdentRest()
} else {
finishNamed()
}
else if (isSupplementary(ch, isUnicodeIdentifierPart))
getIdentRest()
else
finishNamed()
}

@tailrec
Expand Down Expand Up @@ -955,6 +990,19 @@ trait Scanners extends ScannersCommon {
}
getStringPart(multiLine, seenEscapedQuote || q)
} else if (ch == '$') {
// Reads the remainder of a `$identifier` inside an interpolated string and
// publishes it as the `next` token.
// Chars are copied to the buffer (advancing with nextRawChar) for as long as the
// current char is not SU and is a Unicode identifier part. Each char is tested on
// its own, so surrogate pairs are not combined here.
// The buffered name then becomes next.name; next.token is IDENTIFIER unless the
// name's start index, taken relative to kwOffset, selects an entry of kwArray.
def getInterpolatedIdentRest(): Unit = {
  while (ch != SU && isUnicodeIdentifierPart(ch)) {
    putChar(ch)
    nextRawChar()
  }
  next.token = IDENTIFIER
  next.name = newTermName(cbuf.toCharArray)
  cbuf.clear()
  val kwIndex = next.name.start - kwOffset
  if (kwArray.indices.contains(kwIndex))
    next.token = kwArray(kwIndex)
}
nextRawChar()
if (ch == '$' || ch == '"') {
putChar(ch)
Expand All @@ -968,32 +1016,22 @@ trait Scanners extends ScannersCommon {
finishStringPart()
nextRawChar()
next.token = USCORE
} else if (Character.isUnicodeIdentifierStart(ch)) {
} else if (isUnicodeIdentifierStart(ch)) {
finishStringPart()
do {
putChar(ch)
nextRawChar()
} while (ch != SU && Character.isUnicodeIdentifierPart(ch))
next.token = IDENTIFIER
next.name = newTermName(cbuf.toString)
cbuf.clear()
val idx = next.name.start - kwOffset
if (idx >= 0 && idx < kwArray.length) {
next.token = kwArray(idx)
}
putChar(ch)
nextRawChar()
getInterpolatedIdentRest()
} else {
val expectations = "$$, $\", $identifier or ${expression}"
syntaxError(s"invalid string interpolation $$$ch, expected: $expectations")
}
} else {
val isUnclosedLiteral = (ch == SU || (!multiLine && (ch == CR || ch == LF)))
if (isUnclosedLiteral) {
if (isUnclosedLiteral)
if (multiLine)
incompleteInputError("unclosed multi-line string literal")
else {
else
unclosedStringLit(seenEscapedQuote)
}
}
else {
putChar(ch)
nextRawChar()
Expand Down Expand Up @@ -1027,53 +1065,38 @@ trait Scanners extends ScannersCommon {
false
}

/** copy current character into cbuf, interpreting any escape sequences,
* and advance to next character.
/** Copy current character into cbuf, interpreting any escape sequences,
* and advance to next character. Surrogate pairs are consumed (see check
* at fetchSingleQuote), but orphan surrogate is allowed.
*/
protected def getLitChar(): Unit =
if (ch == '\\') {
nextChar()
if ('0' <= ch && ch <= '7') {
val start = charOffset - 2
val leadch: Char = ch
var oct: Int = digit2int(ch, 8)
nextChar()
if ('0' <= ch && ch <= '7') {
oct = oct * 8 + digit2int(ch, 8)
nextChar()
if (leadch <= '3' && '0' <= ch && ch <= '7') {
oct = oct * 8 + digit2int(ch, 8)
nextChar()
}
}
val alt = if (oct == LF) "\\n" else "\\u%04x" format oct
syntaxError(start, s"octal escape literals are unsupported: use $alt instead")
putChar(oct.toChar)
} else {
if (ch == 'u') {
if (getUEscape()) nextChar()
}
else {
ch match {
case 'b' => putChar('\b')
case 't' => putChar('\t')
case 'n' => putChar('\n')
case 'f' => putChar('\f')
case 'r' => putChar('\r')
case '\"' => putChar('\"')
case '\'' => putChar('\'')
case '\\' => putChar('\\')
case _ => invalidEscape()
}
nextChar()
}
}
} else {
charEscape()
} else if (!isSupplementary(ch, _ => true, strict = false)) {
putChar(ch)
nextChar()
}

private def getUEscape(): Boolean = {
/** Interprets the escape selected by the current char `ch` and puts the result to the buffer.
 *
 *  Single-letter escapes put their character directly; `u` and octal digits are
 *  delegated to `uEscape()` and `octalEscape()`; anything else is reported through
 *  `invalidEscape()`. Afterwards the scanner advances one char, unless the delegate
 *  returned false to signal that no further advance is wanted.
 *  NOTE(review): presumably called with `ch` on the char following the backslash — confirm at call site.
 */
private def charEscape(): Unit = {
  // Puts a simple escape's character; such escapes always need the trailing advance.
  def simple(c: Char): Boolean = { putChar(c); true }

  val advance = ch match {
    case 'b'  => simple('\b')
    case 't'  => simple('\t')
    case 'n'  => simple('\n')
    case 'f'  => simple('\f')
    case 'r'  => simple('\r')
    case '\"' => simple('\"')
    case '\'' => simple('\'')
    case '\\' => simple('\\')
    case 'u'  => uEscape()
    case x if x >= '0' && x <= '7' => octalEscape()
    case _ =>
      invalidEscape()
      true
  }
  if (advance) nextChar()
}

private def uEscape(): Boolean = {
while (ch == 'u') nextChar()
var codepoint = 0
var digitsRead = 0
Expand All @@ -1094,7 +1117,25 @@ trait Scanners extends ScannersCommon {
putChar(found)
true
}


/** Consumes an octal escape of one to three digits, reports it as unsupported,
 *  and still puts the character it denotes to the buffer.
 *
 *  A third digit is read only when the first digit is at most '3'. The error
 *  message suggests `\n` when the value equals LF, and a `\\uXXXX` escape otherwise.
 *
 *  @return always false; the digits have already been consumed here, so the
 *          caller must not advance again
 */
private def octalEscape(): Boolean = {
  def isOctalDigit(c: Char): Boolean = '0' <= c && c <= '7'
  // Folds the current digit into `acc` and moves past it.
  def readDigit(acc: Int): Int = {
    val updated = acc * 8 + digit2int(ch, 8)
    nextChar()
    updated
  }

  // NOTE(review): looks like this is the offset of the backslash introducing the escape — confirm.
  val start = charOffset - 2
  val lead: Char = ch
  val oneDigit = readDigit(0)
  val value: Int =
    if (!isOctalDigit(ch)) oneDigit
    else {
      val twoDigits = readDigit(oneDigit)
      if (lead <= '3' && isOctalDigit(ch)) readDigit(twoDigits) else twoDigits
    }

  val replacement = if (value == LF) "\\n" else f"\\u$value%04x"
  syntaxError(start, s"octal escape literals are unsupported: use $replacement instead")
  putChar(value.toChar)
  false
}

protected def invalidEscape(): Unit = {
syntaxError(charOffset - 1, "invalid escape character")
Expand Down
Expand Up @@ -27,9 +27,7 @@ import scala.tools.nsc.io.AbstractFile
*/
final class AbstractFileReader(val buf: Array[Byte]) extends DataReader {
@deprecated("Use other constructor", "2.13.0")
def this(file: AbstractFile) = {
this(file.toByteArray)
}
def this(file: AbstractFile) = this(file.toByteArray)

/** the current input pointer
*/
Expand Down Expand Up @@ -67,9 +65,8 @@ final class AbstractFileReader(val buf: Array[Byte]) extends DataReader {
def getByte(mybp: Int): Byte =
buf(mybp)

def getBytes(mybp: Int, bytes: Array[Byte]): Unit = {
def getBytes(mybp: Int, bytes: Array[Byte]): Unit =
System.arraycopy(buf, mybp, bytes, 0, bytes.length)
}

/** extract a character at position bp from buf
*/
Expand All @@ -95,9 +92,8 @@ final class AbstractFileReader(val buf: Array[Byte]) extends DataReader {
*/
def getDouble(mybp: Int): Double = longBitsToDouble(getLong(mybp))

def getUTF(mybp: Int, len: Int): String = {
def getUTF(mybp: Int, len: Int): String =
new DataInputStream(new ByteArrayInputStream(buf, mybp, len)).readUTF
}

/** skip next 'n' bytes
*/
Expand Down
1 change: 1 addition & 0 deletions src/partest/scala/tools/partest/DirectTest.scala
Expand Up @@ -45,6 +45,7 @@ abstract class DirectTest {
protected def pathOf(locations: String*) = locations.mkString(sys.props("path.separator"))

// override to add additional settings besides -d testOutput.path
// default is -usejavacp
def extraSettings: String = "-usejavacp"
// a default Settings object using only extraSettings
def settings: Settings = newSettings(CommandLineParser.tokenize(extraSettings))
Expand Down
14 changes: 0 additions & 14 deletions src/partest/scala/tools/partest/package.scala
Expand Up @@ -19,7 +19,6 @@ import scala.concurrent.duration.Duration
import scala.io.Codec
import scala.jdk.CollectionConverters._
import scala.tools.nsc.util.Exceptional
import scala.util.chaining._

package object partest {
type File = java.io.File
Expand Down Expand Up @@ -180,17 +179,4 @@ package object partest {
def isDebug = sys.props.contains("partest.debug") || sys.env.contains("PARTEST_DEBUG")
def debugSettings = sys.props.getOrElse("partest.debug.settings", "")
def log(msg: => Any): Unit = if (isDebug) Console.err.println(msg)

// Matches a single character in the POSIX printable class; hexdump uses it to
// choose between showing a byte as text or as '.'.
private val printable = raw"\p{Print}".r

/** Renders `s`, encoded with `codec`, as hex-dump lines of up to 16 bytes each.
 *
 *  Every line holds the 8-digit hex offset of its first byte, the bytes in hex
 *  as two left-aligned groups of eight, and the bytes as text between `|` marks,
 *  with non-printable bytes shown as '.'.
 *
 *  @param s the string to dump
 *  @return a lazy iterator over the formatted lines; empty for the empty string
 */
def hexdump(s: String): Iterator[String] = {
  def hexOf(chunk: Array[Byte]): String = chunk.iterator.map(b => f"$b%02x").mkString(" ")
  def textOf(chunk: Array[Byte]): String =
    chunk.iterator.map(_.toChar).map {
      case c @ printable() => c
      case _               => '.'
    }.mkString
  def render(offset: Int, chunk: Array[Byte]): String = {
    val (lo, hi) = chunk.splitAt(8)
    f"$offset%08x ${hexOf(lo)}%-24s ${hexOf(hi)}%-24s |${textOf(chunk)}|"
  }
  val chunks = s.getBytes(codec.charSet).grouped(16)
  // Every chunk but the last is full, so offsets advance in steps of 16.
  Iterator.from(0, 16).zip(chunks).map { case (offset, chunk) => render(offset, chunk) }
}
}
19 changes: 19 additions & 0 deletions src/testkit/scala/tools/testkit/AssertUtil.scala
Expand Up @@ -51,6 +51,25 @@ object AssertUtil {
// junit fail is Unit
def fail(message: String): Nothing = throw new AssertionError(message)

// Matches a single character in the POSIX printable class; hexdump uses it to
// choose between showing a byte as text or as '.'.
private val printable = raw"\p{Print}".r

/** Renders the UTF-8 encoding of `s` as hex-dump lines of up to 16 bytes each.
 *
 *  Every line holds the 8-digit hex offset of its first byte, the bytes in hex
 *  as two left-aligned groups of eight, and the bytes as text between `|` marks,
 *  with non-printable bytes shown as '.'.
 *
 *  @param s the string to dump
 *  @return a lazy iterator over the formatted lines; empty for the empty string
 */
def hexdump(s: String): Iterator[String] = {
  val width = 16
  val half = width / 2
  def hex(chunk: Array[Byte]): String = chunk.map(b => f"$b%02x").mkString(" ")
  def isPrintable(c: Char): Boolean = printable.pattern.matcher(c.toString).matches()
  def ascii(chunk: Array[Byte]): String =
    chunk.map { b =>
      val c = b.toChar
      if (isPrintable(c)) c else '.'
    }.mkString
  def line(chunk: Array[Byte], index: Int): String = {
    // Every chunk but the last is full, so the offset is a multiple of the width.
    val offset = index * width
    val left = hex(chunk.take(half))
    val right = hex(chunk.slice(half, width))
    f"$offset%08x $left%-24s $right%-24s |${ascii(chunk)}|"
  }
  s.getBytes(scala.io.Codec.UTF8.charSet)
    .grouped(width)
    .zipWithIndex
    .map { case (chunk, index) => line(chunk, index) }
}

// Joins the hexdump lines of `s` with newlines, for use in failure messages.
private def dump(s: String) = hexdump(s).mkString("\n")

/** Asserts that `actual` equals `expected`; the failure message shows a hex dump
 *  of both strings so that differences in invisible characters can be seen.
 */
def assertEqualStrings(expected: String)(actual: String) = {
  def failureMessage = s"Expected:\n${dump(expected)}\nActual:\n${dump(actual)}"
  assert(expected == actual, failureMessage)
}

private final val timeout = 60 * 1000L // wait a minute

private implicit class `ref helper`[A](val r: Reference[A]) extends AnyVal {
Expand Down
10 changes: 10 additions & 0 deletions test/files/neg/surrogates.check
@@ -0,0 +1,10 @@
surrogates.scala:3: error: illegal codepoint in Char constant: '\ud801\udc00'
def `too wide for Char` = '𐐀'
^
surrogates.scala:4: error: invalid string interpolation $?, expected: $$, $", $identifier or ${expression}
def `alpha required to start` = s"$𐐀"
^
surrogates.scala:4: error: unclosed string literal
def `alpha required to start` = s"$𐐀"
^
3 errors
5 changes: 5 additions & 0 deletions test/files/neg/surrogates.scala
@@ -0,0 +1,5 @@

class C {
def `too wide for Char` = '𐐀'
def `alpha required to start` = s"$𐐀"
}

0 comments on commit 6c1bb5b

Please sign in to comment.