Skip to content

Commit

Permalink
Accept supplementary characters
Browse files Browse the repository at this point in the history
  • Loading branch information
som-snytt committed Jul 4, 2021
1 parent f611f6c commit 74de6d5
Show file tree
Hide file tree
Showing 11 changed files with 162 additions and 44 deletions.
1 change: 1 addition & 0 deletions build.sbt
Expand Up @@ -717,6 +717,7 @@ lazy val junit = project.in(file("test") / "junit")
"-feature",
"-Xlint:-valpattern,_",
"-Wconf:msg=match may not be exhaustive:s", // if we missed a case, all that happens is the test fails
"-Wconf:cat=lint-nullary-unit:s", // normal unit test style
"-Ypatmat-exhaust-depth", "40", // despite not caring about patmat exhaustiveness, we still get warnings for this
),
Compile / javacOptions ++= Seq("-Xlint"),
Expand Down
120 changes: 103 additions & 17 deletions src/compiler/scala/tools/nsc/ast/parser/Scanners.scala
Expand Up @@ -700,7 +700,10 @@ trait Scanners extends ScannersCommon {
else if (!isAtEnd && (ch != SU && ch != CR && ch != LF)) {
val isEmptyCharLit = (ch == '\'')
getLitChar()
if (ch == '\'') {
if (Character.isHighSurrogate(cbuf.charAt(0))) {
syntaxError("illegal codepoint in Char constant")
if (ch == '\'') nextChar()
} else if (ch == '\'') {
if (isEmptyCharLit)
syntaxError("empty character literal (use '\\'' for single quote)")
else {
Expand Down Expand Up @@ -749,22 +752,40 @@ trait Scanners extends ScannersCommon {
}
case _ =>
def fetchOther() = {
import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierStart, isValidCodePoint, toCodePoint}
if (ch == '\u21D2') {
deprecationWarning("The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", "2.13.0")
nextChar(); token = ARROW
} else if (ch == '\u2190') {
deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", "2.13.0")
nextChar(); token = LARROW
} else if (Character.isUnicodeIdentifierStart(ch)) {
} else if (isUnicodeIdentifierStart(ch)) {
putChar(ch)
nextChar()
getIdentRest()
} else if (isHighSurrogate(ch)) {
val high = ch
nextChar()
if (isLowSurrogate(ch)) {
val low = ch
nextChar()
val codepoint = toCodePoint(high, low)
if (isValidCodePoint(codepoint) && isUnicodeIdentifierStart(codepoint)) {
putChar(high)
putChar(low)
getIdentRest()
} else {
syntaxError(f"illegal character '\\u${high.toInt}%04x\\u${low.toInt}%04x'")
}
} else {
syntaxError(f"illegal character '\\u${high.toInt}%04x' missing low surrogate")
}
} else if (isSpecial(ch)) {
putChar(ch)
nextChar()
getOperatorRest()
} else {
syntaxError("illegal character '" + ("" + '\\' + 'u' + "%04x".format(ch.toInt)) + "'")
syntaxError(f"illegal character '\\u${ch.toInt}%04x'")
nextChar()
}
}
Expand Down Expand Up @@ -831,10 +852,28 @@ trait Scanners extends ScannersCommon {
case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
finishNamed()
case _ =>
if (Character.isUnicodeIdentifierPart(ch)) {
import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierPart, isValidCodePoint, toCodePoint}
if (isUnicodeIdentifierPart(ch)) {
putChar(ch)
nextChar()
getIdentRest()
} else if (isHighSurrogate(ch)) {
val high = ch
nextChar()
if (isLowSurrogate(ch)) {
val low = ch
nextChar()
val codepoint = toCodePoint(high, low)
if (isValidCodePoint(codepoint) && isUnicodeIdentifierPart(codepoint)) {
putChar(high)
putChar(low)
getIdentRest()
} else {
syntaxError(f"illegal character '\\u${high.toInt}%04x\\u${low.toInt}%04x'")
}
} else {
syntaxError(f"illegal character '\\u${high.toInt}%04x' missing low surrogate")
}
} else {
finishNamed()
}
Expand Down Expand Up @@ -955,6 +994,38 @@ trait Scanners extends ScannersCommon {
}
getStringPart(multiLine, seenEscapedQuote || q)
} else if (ch == '$') {
import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierPart, isUnicodeIdentifierStart, isValidCodePoint, toCodePoint}
def isUnicodeSurrogate(ch: Char, f: Int => Boolean): Boolean =
isHighSurrogate(ch) && {
val hi = ch
val r = lookaheadReader
r.nextRawChar()
val lo = r.ch
isLowSurrogate(lo) && {
val codepoint = toCodePoint(hi, lo)
isValidCodePoint(codepoint) && f(codepoint)
}
}
@tailrec def getInterpolatedIdentRest(): Unit =
if (ch != SU && isUnicodeIdentifierPart(ch)) {
putChar(ch)
nextRawChar()
getInterpolatedIdentRest()
} else if (isUnicodeSurrogate(ch, isUnicodeIdentifierPart)) {
putChar(ch)
nextRawChar()
putChar(ch)
nextRawChar()
getInterpolatedIdentRest()
} else {
next.token = IDENTIFIER
next.name = newTermName(cbuf.toString)
cbuf.clear()
val idx = next.name.start - kwOffset
if (idx >= 0 && idx < kwArray.length) {
next.token = kwArray(idx)
}
}
nextRawChar()
if (ch == '$' || ch == '"') {
putChar(ch)
Expand All @@ -968,19 +1039,18 @@ trait Scanners extends ScannersCommon {
finishStringPart()
nextRawChar()
next.token = USCORE
} else if (Character.isUnicodeIdentifierStart(ch)) {
} else if (isUnicodeIdentifierStart(ch)) {
finishStringPart()
do {
putChar(ch)
nextRawChar()
} while (ch != SU && Character.isUnicodeIdentifierPart(ch))
next.token = IDENTIFIER
next.name = newTermName(cbuf.toString)
cbuf.clear()
val idx = next.name.start - kwOffset
if (idx >= 0 && idx < kwArray.length) {
next.token = kwArray(idx)
}
putChar(ch)
nextRawChar()
getInterpolatedIdentRest()
} else if (isUnicodeSurrogate(ch, isUnicodeIdentifierStart)) {
finishStringPart()
putChar(ch)
nextRawChar()
putChar(ch)
nextRawChar()
getInterpolatedIdentRest()
} else {
val expectations = "$$, $\", $identifier or ${expression}"
syntaxError(s"invalid string interpolation $$$ch, expected: $expectations")
Expand Down Expand Up @@ -1068,7 +1138,23 @@ trait Scanners extends ScannersCommon {
nextChar()
}
}
} else {
} else if (Character.isHighSurrogate(ch)) {
val high = ch
nextChar()
if (Character.isLowSurrogate(ch)) {
val low = ch
nextChar()
val codepoint = Character.toCodePoint(high, low)
if (Character.isValidCodePoint(codepoint)) {
putChar(high)
putChar(low)
} else {
syntaxError(f"illegal character '\\u${high.toInt}%04x\\u${low.toInt}%04x'")
}
} else {
syntaxError(f"illegal character '\\u${high.toInt}%04x' missing low surrogate")
}
} else {
putChar(ch)
nextChar()
}
Expand Down
Expand Up @@ -27,9 +27,7 @@ import scala.tools.nsc.io.AbstractFile
*/
final class AbstractFileReader(val buf: Array[Byte]) extends DataReader {
@deprecated("Use other constructor", "2.13.0")
def this(file: AbstractFile) = {
this(file.toByteArray)
}
def this(file: AbstractFile) = this(file.toByteArray)

/** the current input pointer
*/
Expand Down Expand Up @@ -67,9 +65,8 @@ final class AbstractFileReader(val buf: Array[Byte]) extends DataReader {
def getByte(mybp: Int): Byte =
buf(mybp)

def getBytes(mybp: Int, bytes: Array[Byte]): Unit = {
def getBytes(mybp: Int, bytes: Array[Byte]): Unit =
System.arraycopy(buf, mybp, bytes, 0, bytes.length)
}

/** extract a character at position bp from buf
*/
Expand All @@ -95,9 +92,8 @@ final class AbstractFileReader(val buf: Array[Byte]) extends DataReader {
*/
def getDouble(mybp: Int): Double = longBitsToDouble(getLong(mybp))

def getUTF(mybp: Int, len: Int): String = {
def getUTF(mybp: Int, len: Int): String =
new DataInputStream(new ByteArrayInputStream(buf, mybp, len)).readUTF
}

/** skip next 'n' bytes
*/
Expand Down
13 changes: 0 additions & 13 deletions src/partest/scala/tools/partest/package.scala
Expand Up @@ -180,17 +180,4 @@ package object partest {
def isDebug = sys.props.contains("partest.debug") || sys.env.contains("PARTEST_DEBUG")
def debugSettings = sys.props.getOrElse("partest.debug.settings", "")
def log(msg: => Any): Unit = if (isDebug) Console.err.println(msg)

private val printable = raw"\p{Print}".r

def hexdump(s: String): Iterator[String] = {
var offset = 0
def hex(bytes: Array[Byte]) = bytes.map(b => f"$b%02x").mkString(" ")
def charFor(byte: Byte): Char = byte.toChar match { case c @ printable() => c ; case _ => '.' }
def ascii(bytes: Array[Byte]) = bytes.map(charFor).mkString
def format(bytes: Array[Byte]): String =
f"$offset%08x ${hex(bytes.slice(0, 8))}%-24s ${hex(bytes.slice(8, 16))}%-24s |${ascii(bytes)}|"
.tap(_ => offset += bytes.length)
s.getBytes(codec.charSet).grouped(16).map(format)
}
}
19 changes: 19 additions & 0 deletions src/testkit/scala/tools/testkit/AssertUtil.scala
Expand Up @@ -51,6 +51,25 @@ object AssertUtil {
// junit fail is Unit
def fail(message: String): Nothing = throw new AssertionError(message)

/** Matches a single character in the `\p{Print}` class; used to decide which bytes are shown verbatim. */
private val printable = raw"\p{Print}".r

/** Renders the UTF-8 encoding of `s` as hex-dump rows of up to 16 bytes each.
 *
 *  Each row holds the running byte offset (8 hex digits), the first eight
 *  bytes in hex, the remaining bytes in hex, and a text column in which any
 *  byte that is not printable is shown as `.`.
 *
 *  @param s the string to dump
 *  @return a lazy iterator with one formatted row per 16-byte chunk; empty for the empty string
 */
def hexdump(s: String): Iterator[String] = {
  import scala.io.Codec
  val utf8: Codec = Codec.UTF8
  val rowWidth = 16

  // Two lowercase hex digits per byte, separated by single spaces.
  def hexOf(chunk: Array[Byte]): String = chunk.map(b => f"$b%02x").mkString(" ")

  // The byte itself when it is printable, otherwise a placeholder dot.
  def glyph(b: Byte): Char = b.toChar match {
    case c @ printable() => c
    case _               => '.'
  }

  def render(chunk: Array[Byte], offset: Int): String = {
    val left  = hexOf(chunk.slice(0, 8))
    val right = hexOf(chunk.slice(8, 16))
    val text  = chunk.map(glyph).mkString
    f"$offset%08x $left%-24s $right%-24s |$text|"
  }

  // Every chunk except possibly the last is exactly rowWidth bytes long,
  // so the offset of a row is its index times rowWidth.
  s.getBytes(utf8.charSet)
    .grouped(rowWidth)
    .zipWithIndex
    .map { case (chunk, index) => render(chunk, index * rowWidth) }
}

/** Joins the rows of `hexdump(s)` into one newline-separated string. */
private def dump(s: String) = hexdump(s).mkString("\n")

/** Asserts that two strings are equal, reporting hex dumps of both on failure.
 *
 *  @param expected the string the caller expects
 *  @param actual   the string that was actually produced
 */
def assertEqualStrings(expected: String)(actual: String) = {
  // A def, so the dumps are only built where the message is actually evaluated.
  def report = s"Expected:\n${dump(expected)}\nActual:\n${dump(actual)}"
  assert(expected == actual, report)
}

private final val timeout = 60 * 1000L // wait a minute

private implicit class `ref helper`[A](val r: Reference[A]) extends AnyVal {
Expand Down
4 changes: 4 additions & 0 deletions test/files/neg/surrogates.check
@@ -0,0 +1,4 @@
surrogates.scala:3: error: illegal codepoint in Char constant
def c = '𐐀'
^
1 error
4 changes: 4 additions & 0 deletions test/files/neg/surrogates.scala
@@ -0,0 +1,4 @@

class C {
def c = '𐐀'
}
12 changes: 12 additions & 0 deletions test/files/pos/surrogates.scala
@@ -0,0 +1,12 @@

class 𐐀 {
def 𐐀 = 42
def x = "𐐀"
def y = s"$𐐀"
}

case class 𐐀𐐀(n: Int) {
def 𐐀𐐀 = n
}

// was: error: illegal character '\ud801', '\udc00'
3 changes: 2 additions & 1 deletion test/files/run/t12276.scala
@@ -1,6 +1,7 @@
import scala.tools.nsc.Settings
import scala.tools.nsc.interpreter.shell.{ILoop, ShellConfig}
import scala.tools.partest.{hexdump, ReplTest}
import scala.tools.partest.ReplTest
import scala.tools.testkit.AssertUtil.hexdump

object Test extends ReplTest {
def code = s"""
Expand Down
14 changes: 8 additions & 6 deletions test/files/run/t9915/Test_2.scala
@@ -1,12 +1,14 @@

import scala.tools.testkit.AssertUtil.assertEqualStrings

object Test extends App {
val c = new C_1
assert(c.nulled == "X\u0000ABC") // "X\000ABC"
assert(c.supped == "𐒈𐒝𐒑𐒛𐒐𐒘𐒕𐒖")
assert(C_1.NULLED.length == "XYABC".length)
assert(C_1.SUPPED.codePointCount(0, C_1.SUPPED.length) == 8)

assert(C_1.NULLED == "X\u0000ABC") // "X\000ABC"
assert(C_1.SUPPED == "𐒈𐒝𐒑𐒛𐒐𐒘𐒕𐒖")
assertEqualStrings(c.nulled)("X\u0000ABC") // "X\000ABC" in java source
assertEqualStrings(c.supped)("𐒈𐒝𐒑𐒛𐒐𐒘𐒕𐒖")

assert(C_1.NULLED.size == "XYABC".size)
assert(C_1.SUPPED.codePointCount(0, C_1.SUPPED.length) == 8)
assertEqualStrings(C_1.NULLED)("X\u0000ABC") // "X\000ABC" in java source
assertEqualStrings(C_1.SUPPED)("𐒈𐒝𐒑𐒛𐒐𐒘𐒕𐒖")
}
6 changes: 6 additions & 0 deletions test/junit/scala/tools/testkit/AssertUtilTest.scala
Expand Up @@ -110,4 +110,10 @@ class AssertUtilTest {
assertEquals(1, sut.errors.size)
assertEquals(0, sut.errors.head._2.getSuppressed.length)
}

/** TODO
@Test def `hexdump is supplementary-aware`: Unit = {
assertEquals("00000000 f0 90 90 80 |𐐀.|", hexdump("\ud801\udc00").next())
}
*/
}

0 comments on commit 74de6d5

Please sign in to comment.