Merge pull request #9687 from som-snytt/issue/1406

Accept supplementary Unicode characters in source code
scala · Aug 2, 2021 · 3f06ac7 · 3f06ac7
2 parents 3081265 + b124a54
commit 3f06ac7
Show file tree

Hide file tree

Showing 19 changed files with 287 additions and 107 deletions.
diff --git a/build.sbt b/build.sbt
@@ -723,6 +723,7 @@ lazy val junit = project.in(file("test") / "junit")
       "-feature",
       "-Xlint:-valpattern,_",
       "-Wconf:msg=match may not be exhaustive:s", // if we missed a case, all that happens is the test fails
+      "-Wconf:cat=lint-nullary-unit&site=.*Test:s", // normal unit test style
       "-Ypatmat-exhaust-depth", "40", // despite not caring about patmat exhaustiveness, we still get warnings for this
     ),
     Compile / javacOptions ++= Seq("-Xlint"),

diff --git a/spec/01-lexical-syntax.md b/spec/01-lexical-syntax.md
@@ -6,13 +6,11 @@ chapter: 1
 
 # Lexical Syntax
 
-Scala programs are written using the Unicode Basic Multilingual Plane
-(_BMP_) character set; Unicode supplementary characters are not
-presently supported.  This chapter defines the two modes of Scala's
-lexical syntax, the Scala mode, and the _XML mode_. If not
-otherwise mentioned, the following descriptions of Scala tokens refer
-to _Scala mode_, and literal characters ‘c’ refer to the ASCII fragment
-`\u0000` – `\u007F`.
+Scala source code consists of Unicode text.
+
+The program text is tokenized as described in this chapter.
+See the last section for special support for XML literals,
+which are parsed in _XML mode_.
 
 To construct tokens, characters are distinguished according to the following
 classes (Unicode general category given in parentheses):
@@ -74,7 +72,7 @@ or `_`, and _constant identifiers_, which do not.
 For this purpose, lower case letters include not only a-z,
 but also all characters in Unicode category Ll (lowercase letter),
 as well as all letters that have contributory property
-Other_Lowercase, except characters in category Nl (letter numerals)
+Other_Lowercase, except characters in category Nl (letter numerals),
 which are never taken as lower case.
 
 The following are examples of variable identifiers:

diff --git a/spec/06-expressions.md b/spec/06-expressions.md
@@ -659,7 +659,7 @@ character. Characters are listed below in increasing order of
 precedence, with characters on the same line having the same precedence.
 
 ```scala
-(all letters)
+(all letters, as defined in [chapter 1](01-lexical-syntax.html), including `_` and `$`)
 |
 ^
 &
@@ -668,7 +668,7 @@ precedence, with characters on the same line having the same precedence.
 :
 + -
 * / %
-(all other special characters)
+(other operator characters, as defined in [chapter 1](01-lexical-syntax.html), including Unicode categories `Sm` and `So`)
 ```
 
 That is, operators starting with a letter have lowest precedence,

diff --git a/src/compiler/scala/tools/nsc/ast/parser/Parsers.scala b/src/compiler/scala/tools/nsc/ast/parser/Parsers.scala
@@ -264,7 +264,7 @@ self =>
       if (syntaxErrors.isEmpty) firstTry
       else in.healBraces() match {
         case Nil      => showSyntaxErrors() ; firstTry
-        case patches  => (this withPatches patches).parse()
+        case patches  => withPatches(patches).parse()
       }
     }
   }

diff --git a/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala b/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala
@@ -172,7 +172,45 @@ trait Scanners extends ScannersCommon {
     /** A switch whether operators at the start of lines can be infix operators. */
     private var allowLeadingInfixOperators = true
 
-    private def isDigit(c: Char) = java.lang.Character isDigit c
+    private def isDigit(c: Char) = Character.isDigit(c)
+
+    import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierPart, isUnicodeIdentifierStart, isValidCodePoint, toCodePoint}
+
+    // If `high` (the current ch) is a high surrogate, it is consumed and the char after it is examined.
+    // True means chars were put to buffer: the whole pair when `test` accepts its codepoint,
+    // or just the lone high surrogate when not `strict` (literals); otherwise a syntax error is issued.
+    private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
+      isHighSurrogate(high) && {
+        var res = false
+        nextChar() // move past the high surrogate regardless of what follows
+        val low = ch
+        if (isLowSurrogate(low)) {
+          nextChar() // the low surrogate is consumed too, even if the codepoint is rejected below
+          val codepoint = toCodePoint(high, low)
+          if (isValidCodePoint(codepoint) && test(codepoint)) {
+            putChar(high)
+            putChar(low)
+            res = true
+          } else
+            syntaxError(f"illegal character '\\u$high%04x\\u$low%04x'")
+        } else if (!strict) {
+          putChar(high) // lenient mode keeps the orphan high surrogate; the char after it stays in ch
+          res = true
+        } else
+          syntaxError(f"illegal character '\\u$high%04x' missing low surrogate")
+        res
+      }
+    private def atSupplementary(ch: Char, f: Int => Boolean): Boolean = // test only: nothing is put to buffer here
+      isHighSurrogate(ch) && {
+        val hi = ch
+        val r = lookaheadReader // NOTE(review): presumably a separate reader, leaving this scanner position unchanged -- confirm
+        r.nextRawChar()
+        val lo = r.ch // the char that follows the candidate high surrogate
+        isLowSurrogate(lo) && {
+          val codepoint = toCodePoint(hi, lo)
+          isValidCodePoint(codepoint) && f(codepoint) // true only for a well-formed pair whose codepoint passes f
+        }
+      }
 
     private var openComments = 0
     final protected def putCommentChar(): Unit = { processCommentChar(); nextChar() }
@@ -705,14 +743,18 @@ trait Scanners extends ScannersCommon {
                   syntaxError("empty character literal (use '\\'' for single quote)")
                 else {
                   nextChar()
-                  token = CHARLIT
-                  setStrVal()
+                  if (cbuf.length != 1)
+                    syntaxError("illegal codepoint in Char constant: " + cbuf.toString.map(c => f"\\u$c%04x").mkString("'", "", "'"))
+                  else {
+                    token = CHARLIT
+                    setStrVal()
+                  }
                 }
-              } else if (isEmptyCharLit) {
+              }
+              else if (isEmptyCharLit)
                 syntaxError("empty character literal")
-              } else {
+              else
                 unclosedCharLit()
-              }
             }
             else unclosedCharLit()
           }
@@ -755,16 +797,18 @@ trait Scanners extends ScannersCommon {
             } else if (ch == '\u2190') {
               deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", "2.13.0")
               nextChar(); token = LARROW
-            } else if (Character.isUnicodeIdentifierStart(ch)) {
+            } else if (isUnicodeIdentifierStart(ch)) {
               putChar(ch)
               nextChar()
               getIdentRest()
             } else if (isSpecial(ch)) {
               putChar(ch)
               nextChar()
               getOperatorRest()
+            } else if (isSupplementary(ch, isUnicodeIdentifierStart)) {
+              getIdentRest()
             } else {
-              syntaxError("illegal character '" + ("" + '\\' + 'u' + "%04x".format(ch.toInt)) + "'")
+              syntaxError(f"illegal character '\\u$ch%04x'")
               nextChar()
             }
           }
@@ -831,13 +875,15 @@ trait Scanners extends ScannersCommon {
       case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
         finishNamed()
       case _ =>
-        if (Character.isUnicodeIdentifierPart(ch)) {
+        if (isUnicodeIdentifierPart(ch)) {
           putChar(ch)
           nextChar()
           getIdentRest()
-        } else {
-          finishNamed()
         }
+        else if (isSupplementary(ch, isUnicodeIdentifierPart))
+          getIdentRest()
+        else
+          finishNamed()
     }
 
     @tailrec
@@ -955,6 +1001,25 @@ trait Scanners extends ScannersCommon {
         }
         getStringPart(multiLine, seenEscapedQuote || q)
       } else if (ch == '$') {
+        @tailrec def getInterpolatedIdentRest(): Unit = // scans the rest of a $identifier inside an interpolated string
+          if (ch != SU && isUnicodeIdentifierPart(ch)) { // SU is excluded explicitly, as in the do-while loop this replaces
+            putChar(ch)
+            nextRawChar()
+            getInterpolatedIdentRest()
+          } else if (atSupplementary(ch, isUnicodeIdentifierPart)) {
+            putChar(ch) // high surrogate
+            nextRawChar()
+            putChar(ch) // low surrogate, already checked by atSupplementary
+            nextRawChar()
+            getInterpolatedIdentRest()
+          } else {
+            next.token = IDENTIFIER
+            next.name = newTermName(cbuf.toCharArray)
+            cbuf.clear()
+            val idx = next.name.start - kwOffset // NOTE(review): presumably an index into the keyword table -- confirm
+            if (idx >= 0 && idx < kwArray.length)
+              next.token = kwArray(idx) // in range: the token from kwArray replaces IDENTIFIER
+          }
         nextRawChar()
         if (ch == '$' || ch == '"') {
           putChar(ch)
@@ -968,32 +1033,29 @@ trait Scanners extends ScannersCommon {
           finishStringPart()
           nextRawChar()
           next.token = USCORE
-        } else if (Character.isUnicodeIdentifierStart(ch)) {
+        } else if (isUnicodeIdentifierStart(ch)) {
           finishStringPart()
-          do {
-            putChar(ch)
-            nextRawChar()
-          } while (ch != SU && Character.isUnicodeIdentifierPart(ch))
-          next.token = IDENTIFIER
-          next.name = newTermName(cbuf.toString)
-          cbuf.clear()
-          val idx = next.name.start - kwOffset
-          if (idx >= 0 && idx < kwArray.length) {
-            next.token = kwArray(idx)
-          }
+          putChar(ch)
+          nextRawChar()
+          getInterpolatedIdentRest()
+        } else if (atSupplementary(ch, isUnicodeIdentifierStart)) {
+          finishStringPart()
+          putChar(ch)
+          nextRawChar()
+          putChar(ch)
+          nextRawChar()
+          getInterpolatedIdentRest()
         } else {
           val expectations = "$$, $\", $identifier or ${expression}"
           syntaxError(s"invalid string interpolation $$$ch, expected: $expectations")
         }
       } else {
         val isUnclosedLiteral = (ch == SU || (!multiLine && (ch == CR || ch == LF)))
-        if (isUnclosedLiteral) {
+        if (isUnclosedLiteral)
           if (multiLine)
             incompleteInputError("unclosed multi-line string literal")
-          else {
+          else
             unclosedStringLit(seenEscapedQuote)
-          }
-        }
         else {
           putChar(ch)
           nextRawChar()
@@ -1027,53 +1089,38 @@ trait Scanners extends ScannersCommon {
         false
       }
 
-    /** copy current character into cbuf, interpreting any escape sequences,
-     *  and advance to next character.
+    /** Copy current character into cbuf, interpreting any escape sequences,
+     *  and advance to next character. A surrogate pair is consumed as a unit (see the
+     *  check at fetchSingleQuote), and an orphan surrogate is accepted as is.
      */
     protected def getLitChar(): Unit =
       if (ch == '\\') {
         nextChar()
-        if ('0' <= ch && ch <= '7') {
-          val start = charOffset - 2
-          val leadch: Char = ch
-          var oct: Int = digit2int(ch, 8)
-          nextChar()
-          if ('0' <= ch && ch <= '7') {
-            oct = oct * 8 + digit2int(ch, 8)
-            nextChar()
-            if (leadch <= '3' && '0' <= ch && ch <= '7') {
-              oct = oct * 8 + digit2int(ch, 8)
-              nextChar()
-            }
-          }
-          val alt = if (oct == LF) "\\n" else "\\u%04x" format oct
-          syntaxError(start, s"octal escape literals are unsupported: use $alt instead")
-          putChar(oct.toChar)
-        } else {
-          if (ch == 'u') {
-            if (getUEscape()) nextChar()
-          }
-          else {
-            ch match {
-              case 'b'  => putChar('\b')
-              case 't'  => putChar('\t')
-              case 'n'  => putChar('\n')
-              case 'f'  => putChar('\f')
-              case 'r'  => putChar('\r')
-              case '\"' => putChar('\"')
-              case '\'' => putChar('\'')
-              case '\\' => putChar('\\')
-              case _    => invalidEscape()
-            }
-            nextChar()
-          }
-        }
-      } else  {
+        charEscape()
+      } else if (!isSupplementary(ch, _ => true, strict = false)) {
         putChar(ch)
         nextChar()
       }
 
-    private def getUEscape(): Boolean = {
+    private def charEscape(): Unit = { // interprets the char after a backslash; getLitChar calls this once the backslash is consumed
+      var bump = true // whether the escape char still has to be skipped at the end
+      ch match {
+        case 'b'  => putChar('\b')
+        case 't'  => putChar('\t')
+        case 'n'  => putChar('\n')
+        case 'f'  => putChar('\f')
+        case 'r'  => putChar('\r')
+        case '\"' => putChar('\"')
+        case '\'' => putChar('\'')
+        case '\\' => putChar('\\')
+        case 'u'  => bump = uEscape()
+        case x if '0' <= x && x <= '7' => bump = octalEscape() // octalEscape moves past the digits itself and returns false
+        case _    => invalidEscape()
+      }
+      if (bump) nextChar()
+    }
+
+    private def uEscape(): Boolean = {
       while (ch == 'u') nextChar()
       var codepoint = 0
       var digitsRead = 0
@@ -1094,7 +1141,25 @@ trait Scanners extends ScannersCommon {
       putChar(found)
       true
     }
-
+
+    private def octalEscape(): Boolean = { // reads up to three octal digits, reports the error, and still puts the decoded char
+      val start = charOffset - 2 // NOTE(review): presumably the offset of the backslash, used as the error position -- confirm
+      val leadch: Char = ch
+      var oct: Int = digit2int(ch, 8)
+      nextChar()
+      if ('0' <= ch && ch <= '7') {
+        oct = oct * 8 + digit2int(ch, 8)
+        nextChar()
+        if (leadch <= '3' && '0' <= ch && ch <= '7') { // a third digit only after a lead digit of 0-3, so the value stays at most 255
+          oct = oct * 8 + digit2int(ch, 8)
+          nextChar()
+        }
+      }
+      val alt = if (oct == LF) "\\n" else f"\\u$oct%04x"
+      syntaxError(start, s"octal escape literals are unsupported: use $alt instead")
+      putChar(oct.toChar)
+      false // the digits are already consumed, so the caller must not advance again
+    }
 
     protected def invalidEscape(): Unit = {
       syntaxError(charOffset - 1, "invalid escape character")

diff --git a/src/compiler/scala/tools/nsc/symtab/classfile/AbstractFileReader.scala b/src/compiler/scala/tools/nsc/symtab/classfile/AbstractFileReader.scala
@@ -27,9 +27,7 @@ import scala.tools.nsc.io.AbstractFile
  */
 final class AbstractFileReader(val buf: Array[Byte]) extends DataReader {
   @deprecated("Use other constructor", "2.13.0")
-  def this(file: AbstractFile) = {
-    this(file.toByteArray)
-  }
+  def this(file: AbstractFile) = this(file.toByteArray)
 
   /** the current input pointer
    */
@@ -67,9 +65,8 @@ final class AbstractFileReader(val buf: Array[Byte]) extends DataReader {
   def getByte(mybp: Int): Byte =
     buf(mybp)
 
-  def getBytes(mybp: Int, bytes: Array[Byte]): Unit = {
+  def getBytes(mybp: Int, bytes: Array[Byte]): Unit =
     System.arraycopy(buf, mybp, bytes, 0, bytes.length)
-  }
 
   /** extract a character at position bp from buf
    */
@@ -95,9 +92,8 @@ final class AbstractFileReader(val buf: Array[Byte]) extends DataReader {
    */
   def getDouble(mybp: Int): Double = longBitsToDouble(getLong(mybp))
 
-  def getUTF(mybp: Int, len: Int): String = {
+  def getUTF(mybp: Int, len: Int): String =
     new DataInputStream(new ByteArrayInputStream(buf, mybp, len)).readUTF
-  }
 
   /** skip next 'n' bytes
    */

diff --git a/src/partest/scala/tools/partest/DirectTest.scala b/src/partest/scala/tools/partest/DirectTest.scala
@@ -45,6 +45,7 @@ abstract class DirectTest {
   protected def pathOf(locations: String*) = locations.mkString(sys.props("path.separator"))
 
   // override to add additional settings besides -d testOutput.path
+  // default is -usejavacp
   def extraSettings: String = "-usejavacp"
   // a default Settings object using only extraSettings
   def settings: Settings = newSettings(CommandLineParser.tokenize(extraSettings))