Accept supplementary characters in identifiers

scala · Jul 8, 2021 · 6c1bb5b · 6c1bb5b
1 parent ccd9576
commit 6c1bb5b
Show file tree

Hide file tree

Showing 16 changed files with 241 additions and 96 deletions.
diff --git a/build.sbt b/build.sbt
@@ -717,6 +717,7 @@ lazy val junit = project.in(file("test") / "junit")
       "-feature",
       "-Xlint:-valpattern,_",
       "-Wconf:msg=match may not be exhaustive:s", // if we missed a case, all that happens is the test fails
+      "-Wconf:cat=lint-nullary-unit&site=.*Test:s", // normal unit test style
       "-Ypatmat-exhaust-depth", "40", // despite not caring about patmat exhaustiveness, we still get warnings for this
     ),
     Compile / javacOptions ++= Seq("-Xlint"),

diff --git a/src/compiler/scala/tools/nsc/ast/parser/Parsers.scala b/src/compiler/scala/tools/nsc/ast/parser/Parsers.scala
@@ -264,7 +264,7 @@ self =>
       if (syntaxErrors.isEmpty) firstTry
       else in.healBraces() match {
         case Nil      => showSyntaxErrors() ; firstTry
-        case patches  => (this withPatches patches).parse()
+        case patches  => withPatches(patches).parse()
       }
     }
   }

diff --git a/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala b/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala
@@ -172,7 +172,34 @@ trait Scanners extends ScannersCommon {
     /** A switch whether operators at the start of lines can be infix operators. */
     private var allowLeadingInfixOperators = true
 
-    private def isDigit(c: Char) = java.lang.Character isDigit c
+    private def isDigit(c: Char) = Character.isDigit(c)
+
+    import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierPart, isUnicodeIdentifierStart, isValidCodePoint, toCodePoint}
+
+    /** Tests whether `high` (normally the current char `ch`) begins a supplementary
+     *  character, i.e. a high surrogate followed by a low surrogate, whose code point
+     *  satisfies `test`.
+     *
+     *  Input is consumed only on success. On success the accepted char(s) are put to
+     *  the buffer and the scanner is advanced past them. On failure nothing is put and
+     *  the scanner is not advanced, so a caller that falls through to its own handling
+     *  (for example the "illegal character" branch of fetchToken, which itself calls
+     *  nextChar()) does not skip an unrelated following character.
+     *
+     *  @param high   the candidate high surrogate
+     *  @param test   predicate applied to the combined code point
+     *  @param strict if true, a high surrogate without a low surrogate is an error;
+     *                if false (string and char literals), the orphan high surrogate is
+     *                put to the buffer by itself and counts as success
+     *  @return true if one or two chars were put to the buffer and consumed
+     */
+    private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
+      isHighSurrogate(high) && {
+        // Peek at the following char through a lookahead reader instead of advancing,
+        // so that the scanner position is untouched unless the char is accepted.
+        val low = lookaheadReader.getc()
+        if (isLowSurrogate(low)) {
+          val codepoint = toCodePoint(high, low)
+          if (!isValidCodePoint(codepoint)) {
+            syntaxError(f"illegal character '\\u$high%04x\\u$low%04x'")
+            false
+          } else if (test(codepoint)) {
+            putChar(high)
+            putChar(low)
+            nextChar()
+            nextChar()
+            true
+          } else
+            false // well-formed pair that fails the predicate: left in place for the caller
+        } else if (!strict) {
+          putChar(high)
+          nextChar()
+          true
+        } else {
+          syntaxError(f"illegal character '\\u$high%04x' missing low surrogate")
+          false
+        }
+      }
 
     private var openComments = 0
     final protected def putCommentChar(): Unit = { processCommentChar(); nextChar() }
@@ -705,14 +732,18 @@ trait Scanners extends ScannersCommon {
                   syntaxError("empty character literal (use '\\'' for single quote)")
                 else {
                   nextChar()
-                  token = CHARLIT
-                  setStrVal()
+                  if (cbuf.length != 1)
+                    syntaxError("illegal codepoint in Char constant: " + cbuf.toString.map(c => f"\\u$c%04x").mkString("'", "", "'"))
+                  else {
+                    token = CHARLIT
+                    setStrVal()
+                  }
                 }
-              } else if (isEmptyCharLit) {
+              }
+              else if (isEmptyCharLit)
                 syntaxError("empty character literal")
-              } else {
+              else
                 unclosedCharLit()
-              }
             }
             else unclosedCharLit()
           }
@@ -755,16 +786,18 @@ trait Scanners extends ScannersCommon {
             } else if (ch == '\u2190') {
               deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", "2.13.0")
               nextChar(); token = LARROW
-            } else if (Character.isUnicodeIdentifierStart(ch)) {
+            } else if (isUnicodeIdentifierStart(ch)) {
               putChar(ch)
               nextChar()
               getIdentRest()
             } else if (isSpecial(ch)) {
               putChar(ch)
               nextChar()
               getOperatorRest()
+            } else if (isSupplementary(ch, isUnicodeIdentifierStart)) {
+              getIdentRest()
             } else {
-              syntaxError("illegal character '" + ("" + '\\' + 'u' + "%04x".format(ch.toInt)) + "'")
+              syntaxError(f"illegal character '\\u$ch%04x'")
               nextChar()
             }
           }
@@ -831,13 +864,15 @@ trait Scanners extends ScannersCommon {
       case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
         finishNamed()
       case _ =>
-        if (Character.isUnicodeIdentifierPart(ch)) {
+        if (isUnicodeIdentifierPart(ch)) {
           putChar(ch)
           nextChar()
           getIdentRest()
-        } else {
-          finishNamed()
         }
+        else if (isSupplementary(ch, isUnicodeIdentifierPart))
+          getIdentRest()
+        else
+          finishNamed()
     }
 
     @tailrec
@@ -955,6 +990,19 @@ trait Scanners extends ScannersCommon {
         }
         getStringPart(multiLine, seenEscapedQuote || q)
       } else if (ch == '$') {
+        // Collects the rest of a `$identifier` splice into cbuf, then publishes it as the
+        // `next` token. If the name's index (relative to kwOffset) falls inside kwArray,
+        // the token from that table is used in place of IDENTIFIER.
+        def getInterpolatedIdentRest(): Unit = {
+          while (ch != SU && isUnicodeIdentifierPart(ch)) {
+            putChar(ch)
+            nextRawChar()
+          }
+          val ident = newTermName(cbuf.toCharArray)
+          cbuf.clear()
+          next.name = ident
+          val kwIndex = ident.start - kwOffset
+          next.token =
+            if (0 <= kwIndex && kwIndex < kwArray.length) kwArray(kwIndex)
+            else IDENTIFIER
+        }
         nextRawChar()
         if (ch == '$' || ch == '"') {
           putChar(ch)
@@ -968,32 +1016,22 @@ trait Scanners extends ScannersCommon {
           finishStringPart()
           nextRawChar()
           next.token = USCORE
-        } else if (Character.isUnicodeIdentifierStart(ch)) {
+        } else if (isUnicodeIdentifierStart(ch)) {
           finishStringPart()
-          do {
-            putChar(ch)
-            nextRawChar()
-          } while (ch != SU && Character.isUnicodeIdentifierPart(ch))
-          next.token = IDENTIFIER
-          next.name = newTermName(cbuf.toString)
-          cbuf.clear()
-          val idx = next.name.start - kwOffset
-          if (idx >= 0 && idx < kwArray.length) {
-            next.token = kwArray(idx)
-          }
+          putChar(ch)
+          nextRawChar()
+          getInterpolatedIdentRest()
         } else {
           val expectations = "$$, $\", $identifier or ${expression}"
           syntaxError(s"invalid string interpolation $$$ch, expected: $expectations")
         }
       } else {
         val isUnclosedLiteral = (ch == SU || (!multiLine && (ch == CR || ch == LF)))
-        if (isUnclosedLiteral) {
+        if (isUnclosedLiteral)
           if (multiLine)
             incompleteInputError("unclosed multi-line string literal")
-          else {
+          else
             unclosedStringLit(seenEscapedQuote)
-          }
-        }
         else {
           putChar(ch)
           nextRawChar()
@@ -1027,53 +1065,38 @@ trait Scanners extends ScannersCommon {
         false
       }
 
-    /** copy current character into cbuf, interpreting any escape sequences,
-     *  and advance to next character.
+    /** Copy current character into cbuf, interpreting any escape sequences,
+     *  and advance to next character. Surrogate pairs are consumed (see check
+     *  at fetchSingleQuote), but orphan surrogate is allowed.
      */
     protected def getLitChar(): Unit =
       if (ch == '\\') {
         nextChar()
-        if ('0' <= ch && ch <= '7') {
-          val start = charOffset - 2
-          val leadch: Char = ch
-          var oct: Int = digit2int(ch, 8)
-          nextChar()
-          if ('0' <= ch && ch <= '7') {
-            oct = oct * 8 + digit2int(ch, 8)
-            nextChar()
-            if (leadch <= '3' && '0' <= ch && ch <= '7') {
-              oct = oct * 8 + digit2int(ch, 8)
-              nextChar()
-            }
-          }
-          val alt = if (oct == LF) "\\n" else "\\u%04x" format oct
-          syntaxError(start, s"octal escape literals are unsupported: use $alt instead")
-          putChar(oct.toChar)
-        } else {
-          if (ch == 'u') {
-            if (getUEscape()) nextChar()
-          }
-          else {
-            ch match {
-              case 'b'  => putChar('\b')
-              case 't'  => putChar('\t')
-              case 'n'  => putChar('\n')
-              case 'f'  => putChar('\f')
-              case 'r'  => putChar('\r')
-              case '\"' => putChar('\"')
-              case '\'' => putChar('\'')
-              case '\\' => putChar('\\')
-              case _    => invalidEscape()
-            }
-            nextChar()
-          }
-        }
-      } else  {
+        charEscape()
+      } else if (!isSupplementary(ch, _ => true, strict = false)) {
         putChar(ch)
         nextChar()
       }
 
-    private def getUEscape(): Boolean = {
+    /** Interprets the escape whose selector char is the current `ch` (the char after a
+     *  backslash, see getLitChar) and puts the denoted char to the buffer.
+     *
+     *  Simple escapes and invalid escapes leave the selector char current, so it is
+     *  skipped here. The unicode and octal helpers report through their result whether
+     *  one more char still has to be skipped.
+     */
+    private def charEscape(): Unit = {
+      // put the char denoted by a single-letter escape; the selector must still be skipped
+      def literal(c: Char): Boolean = { putChar(c); true }
+      val advance = ch match {
+        case 'b'  => literal('\b')
+        case 't'  => literal('\t')
+        case 'n'  => literal('\n')
+        case 'f'  => literal('\f')
+        case 'r'  => literal('\r')
+        case '\"' => literal('\"')
+        case '\'' => literal('\'')
+        case '\\' => literal('\\')
+        case 'u'  => uEscape()
+        case d if '0' <= d && d <= '7' => octalEscape()
+        case _    => invalidEscape(); true
+      }
+      if (advance) nextChar()
+    }
+
+    private def uEscape(): Boolean = {
       while (ch == 'u') nextChar()
       var codepoint = 0
       var digitsRead = 0
@@ -1094,7 +1117,25 @@ trait Scanners extends ScannersCommon {
       putChar(found)
       true
     }
-
+
+    /** Scans an octal escape whose first digit is the current `ch`.
+     *
+     *  Octal escapes are unsupported, so a syntax error suggesting the equivalent
+     *  escape is always reported; the decoded char is still put to the buffer.
+     *
+     *  @return false, because all digits have already been skipped here and the caller
+     *          (charEscape) must not advance again
+     */
+    private def octalEscape(): Boolean = {
+      val start = charOffset - 2
+      // a third digit is only read when the leading digit is at most '3'
+      val maxDigits = if (ch <= '3') 3 else 2
+      var oct: Int = digit2int(ch, 8)
+      nextChar()
+      var digits = 1
+      while (digits < maxDigits && '0' <= ch && ch <= '7') {
+        oct = oct * 8 + digit2int(ch, 8)
+        nextChar()
+        digits += 1
+      }
+      val alt = if (oct == LF) "\\n" else f"\\u$oct%04x"
+      syntaxError(start, s"octal escape literals are unsupported: use $alt instead")
+      putChar(oct.toChar)
+      false
+    }
 
     protected def invalidEscape(): Unit = {
       syntaxError(charOffset - 1, "invalid escape character")

diff --git a/src/compiler/scala/tools/nsc/symtab/classfile/AbstractFileReader.scala b/src/compiler/scala/tools/nsc/symtab/classfile/AbstractFileReader.scala
@@ -27,9 +27,7 @@ import scala.tools.nsc.io.AbstractFile
  */
 final class AbstractFileReader(val buf: Array[Byte]) extends DataReader {
   @deprecated("Use other constructor", "2.13.0")
-  def this(file: AbstractFile) = {
-    this(file.toByteArray)
-  }
+  def this(file: AbstractFile) = this(file.toByteArray)
 
   /** the current input pointer
    */
@@ -67,9 +65,8 @@ final class AbstractFileReader(val buf: Array[Byte]) extends DataReader {
   def getByte(mybp: Int): Byte =
     buf(mybp)
 
-  def getBytes(mybp: Int, bytes: Array[Byte]): Unit = {
+  def getBytes(mybp: Int, bytes: Array[Byte]): Unit =
     System.arraycopy(buf, mybp, bytes, 0, bytes.length)
-  }
 
   /** extract a character at position bp from buf
    */
@@ -95,9 +92,8 @@ final class AbstractFileReader(val buf: Array[Byte]) extends DataReader {
    */
   def getDouble(mybp: Int): Double = longBitsToDouble(getLong(mybp))
 
-  def getUTF(mybp: Int, len: Int): String = {
+  def getUTF(mybp: Int, len: Int): String =
     new DataInputStream(new ByteArrayInputStream(buf, mybp, len)).readUTF
-  }
 
   /** skip next 'n' bytes
    */

diff --git a/src/partest/scala/tools/partest/DirectTest.scala b/src/partest/scala/tools/partest/DirectTest.scala
@@ -45,6 +45,7 @@ abstract class DirectTest {
   protected def pathOf(locations: String*) = locations.mkString(sys.props("path.separator"))
 
   // override to add additional settings besides -d testOutput.path
+  // default is -usejavacp
   def extraSettings: String = "-usejavacp"
   // a default Settings object using only extraSettings
   def settings: Settings = newSettings(CommandLineParser.tokenize(extraSettings))

diff --git a/src/partest/scala/tools/partest/package.scala b/src/partest/scala/tools/partest/package.scala
@@ -19,7 +19,6 @@ import scala.concurrent.duration.Duration
 import scala.io.Codec
 import scala.jdk.CollectionConverters._
 import scala.tools.nsc.util.Exceptional
-import scala.util.chaining._
 
 package object partest {
   type File         = java.io.File
@@ -180,17 +179,4 @@ package object partest {
   def isDebug                = sys.props.contains("partest.debug") || sys.env.contains("PARTEST_DEBUG")
   def debugSettings          = sys.props.getOrElse("partest.debug.settings", "")
   def log(msg: => Any): Unit = if (isDebug) Console.err.println(msg)
-
-  private val printable = raw"\p{Print}".r
-
-  def hexdump(s: String): Iterator[String] = {
-    var offset = 0
-    def hex(bytes: Array[Byte])   = bytes.map(b => f"$b%02x").mkString(" ")
-    def charFor(byte: Byte): Char = byte.toChar match { case c @ printable() => c ; case _ => '.' }
-    def ascii(bytes: Array[Byte]) = bytes.map(charFor).mkString
-    def format(bytes: Array[Byte]): String =
-      f"$offset%08x  ${hex(bytes.slice(0, 8))}%-24s ${hex(bytes.slice(8, 16))}%-24s |${ascii(bytes)}|"
-        .tap(_ => offset += bytes.length)
-    s.getBytes(codec.charSet).grouped(16).map(format)
-  }
 }
diff --git a/src/testkit/scala/tools/testkit/AssertUtil.scala b/src/testkit/scala/tools/testkit/AssertUtil.scala
@@ -51,6 +51,25 @@ object AssertUtil {
   // junit fail is Unit
   def fail(message: String): Nothing = throw new AssertionError(message)
 
+  private val printable = raw"\p{Print}".r
+
+  /** Renders the UTF-8 bytes of `s` as hexdump lines, sixteen bytes per line.
+   *
+   *  Each line holds the hex offset of its first byte, two columns of up to eight
+   *  hex bytes, and the bytes as text between bars, with a dot standing in for any
+   *  byte that is not a printable char.
+   */
+  def hexdump(s: String): Iterator[String] = {
+    import scala.io.Codec
+    val utf8: Codec = Codec.UTF8
+    def hexColumn(bytes: Array[Byte]): String = bytes.map(b => f"$b%02x").mkString(" ")
+    def glyph(b: Byte): Char = {
+      val c = b.toChar
+      if (printable.pattern.matcher(c.toString).matches()) c else '.'
+    }
+    s.getBytes(utf8.charSet).grouped(16).zipWithIndex.map { case (chunk, index) =>
+      // every chunk before this one is full, so the offset follows from the index
+      val offset = index * 16
+      val (left, right) = chunk.splitAt(8)
+      val gutter = chunk.map(glyph).mkString
+      f"$offset%08x  ${hexColumn(left)}%-24s ${hexColumn(right)}%-24s |$gutter|"
+    }
+  }
+
+  // hexdump of `s` as a single multi-line string
+  private def dump(s: String) = hexdump(s).mkString("\n")
+
+  /** Asserts that the two strings are equal; on failure the message shows a hexdump of
+   *  each, so differences in non-printing chars are visible.
+   */
+  def assertEqualStrings(expected: String)(actual: String) = {
+    def report = s"Expected:\n${dump(expected)}\nActual:\n${dump(actual)}"
+    assert(expected == actual, report)
+  }
+
   private final val timeout = 60 * 1000L                 // wait a minute
 
   private implicit class `ref helper`[A](val r: Reference[A]) extends AnyVal {

diff --git a/test/files/neg/surrogates.check b/test/files/neg/surrogates.check
@@ -0,0 +1,10 @@
+surrogates.scala:3: error: illegal codepoint in Char constant: '\ud801\udc00'
+  def `too wide for Char` = '𐐀'
+                            ^
+surrogates.scala:4: error: invalid string interpolation $?, expected: $$, $", $identifier or ${expression}
+  def `alpha required to start` = s"$𐐀"
+                                    ^
+surrogates.scala:4: error: unclosed string literal
+  def `alpha required to start` = s"$𐐀"
+                                       ^
+3 errors
diff --git a/test/files/neg/surrogates.scala b/test/files/neg/surrogates.scala
@@ -0,0 +1,5 @@
+
+class C {
+  def `too wide for Char` = '𐐀'
+  def `alpha required to start` = s"$𐐀"
+}