Merge pull request #8282 from martijnhoekstra/lostEscape

Un-special-case unicode escapes
scala · Mar 3, 2020 · ee8c1ef · ee8c1ef
2 parents 6f86e6b + 38270c8
commit ee8c1ef
Show file tree

Hide file tree

Showing 36 changed files with 483 additions and 216 deletions.
diff --git a/build.sbt b/build.sbt
@@ -99,6 +99,8 @@ val mimaFilterSettings = Seq {
     ProblemFilters.exclude[MissingClassProblem]("scala.reflect.macros.Attachments$"),
     ProblemFilters.exclude[DirectMissingMethodProblem]("scala.reflect.macros.Attachments.cloneAttachments"),
     ProblemFilters.exclude[DirectMissingMethodProblem]("scala.reflect.macros.NonemptyAttachments.cloneAttachments"),
+    ProblemFilters.exclude[DirectMissingMethodProblem]("scala.StringContext.processUnicode"),
+    ProblemFilters.exclude[MissingClassProblem]("scala.StringContext$InvalidUnicodeEscapeException"),
     ProblemFilters.exclude[DirectAbstractMethodProblem]("scala.collection.immutable.ArraySeq.stepper"),
     ProblemFilters.exclude[ReversedAbstractMethodProblem]("scala.collection.immutable.ArraySeq.stepper"),
     ProblemFilters.exclude[DirectAbstractMethodProblem]("scala.collection.mutable.ArraySeq.stepper"),

diff --git a/project/ScalaOptionParser.scala b/project/ScalaOptionParser.scala
@@ -83,7 +83,7 @@ object ScalaOptionParser {
 
   // TODO retrieve these data programmatically, ala https://github.com/scala/scala-tool-support/blob/master/bash-completion/src/main/scala/BashCompletion.scala
   private def booleanSettingNames = List("-X", "-Xcheckinit", "-Xdev", "-Xdisable-assertions", "-Xexperimental", "-Xfatal-warnings", "-Xlog-free-terms", "-Xlog-free-types", "-Xlog-implicit-conversions", "-Xlog-implicits", "-Xlog-reflective-calls",
-    "-Xno-forwarders", "-Xno-patmat-analysis", "-Xno-uescape", "-Xnojline", "-Xprint-pos", "-Xprint-types", "-Xprompt", "-Xresident", "-Xshow-phases", "-Xverify", "-Y",
+    "-Xno-forwarders", "-Xno-patmat-analysis", "-Xnojline", "-Xprint-pos", "-Xprint-types", "-Xprompt", "-Xresident", "-Xshow-phases", "-Xverify", "-Y",
     "-Ybreak-cycles", "-Ydebug", "-Ycompact-trees", "-YdisableFlatCpCaching", "-Ydoc-debug",
     "-Yide-debug",
     "-Yissue-debug", "-Ylog-classpath", "-Ymacro-debug-lite", "-Ymacro-debug-verbose", "-Ymacro-no-expand",

diff --git a/spec/01-lexical-syntax.md b/spec/01-lexical-syntax.md
@@ -14,21 +14,6 @@ otherwise mentioned, the following descriptions of Scala tokens refer
 to _Scala mode_, and literal characters ‘c’ refer to the ASCII fragment
 `\u0000` – `\u007F`.
 
-In Scala mode, _Unicode escapes_ are replaced by the corresponding
-Unicode character with the given hexadecimal code.
-
-```ebnf
-UnicodeEscape ::= ‘\’ ‘u’ {‘u’} hexDigit hexDigit hexDigit hexDigit
-hexDigit      ::= ‘0’ | … | ‘9’ | ‘A’ | … | ‘F’ | ‘a’ | … | ‘f’
-```
-
-<!--
-TODO scala/bug#4583: UnicodeEscape used to allow additional backslashes,
-and there is something in the code `evenSlashPrefix` that alludes to it,
-but I can't make it work nor can I imagine how this would make sense,
-so I removed it for now.
--->
-
 To construct tokens, characters are distinguished according to the following
 classes (Unicode general category given in parentheses):
 
@@ -54,8 +39,11 @@ plainid  ::=  upper idrest
            |  varid
            |  op
 id       ::=  plainid
-           |  ‘`’ { charNoBackQuoteOrNewline | UnicodeEscape | charEscapeSeq } ‘`’
+           |  ‘`’ { charNoBackQuoteOrNewline | escapeSeq } ‘`’
 idrest   ::=  {letter | digit} [‘_’ op]
+escapeSeq     ::= UnicodeEscape | charEscapeSeq
+UnicodeEscape ::= ‘\’ ‘u’ {‘u’} hexDigit hexDigit hexDigit hexDigit
+hexDigit      ::= ‘0’ | … | ‘9’ | ‘A’ | … | ‘F’ | ‘a’ | … | ‘f’
 ```
 
 There are three ways to form an identifier. First, an identifier can
@@ -427,37 +415,30 @@ members of type `Boolean`.
 ### Character Literals
 
 ```ebnf
-characterLiteral  ::=  ‘'’ (charNoQuoteOrNewline | UnicodeEscape | charEscapeSeq) ‘'’
+characterLiteral  ::=  ‘'’ (charNoQuoteOrNewline | escapeSeq) ‘'’
 ```
 
 A character literal is a single character enclosed in quotes.
 The character can be any Unicode character except the single quote
 delimiter or `\u000A` (LF) or `\u000D` (CR);
-or any Unicode character represented by either a
-[Unicode escape](01-lexical-syntax.html) or by an [escape sequence](#escape-sequences).
+or any Unicode character represented by an
+[escape sequence](#escape-sequences).
 
 > ```scala
 > 'a'    '\u0041'    '\n'    '\t'
 > ```
 
-Note that although Unicode conversion is done early during parsing,
-so that Unicode characters are generally equivalent to their escaped
-expansion in the source text, literal parsing accepts arbitrary
-Unicode escapes, including the character literal `'\u000A'`,
-which can also be written using the escape sequence `'\n'`.
-
 ### String Literals
 
 ```ebnf
 stringLiteral  ::=  ‘"’ {stringElement} ‘"’
-stringElement  ::=  charNoDoubleQuoteOrNewline | UnicodeEscape | charEscapeSeq
+stringElement  ::=  charNoDoubleQuoteOrNewline | escapeSeq
 ```
 
 A string literal is a sequence of characters in double quotes.
 The characters can be any Unicode character except the double quote
 delimiter or `\u000A` (LF) or `\u000D` (CR);
-or any Unicode character represented by either a
-[Unicode escape](01-lexical-syntax.html) or by an [escape sequence](#escape-sequences).
+or any Unicode character represented by an [escape sequence](#escape-sequences).
 
 If the string literal contains a double quote character, it must be escaped using
 `"\""`.
@@ -481,8 +462,8 @@ triple quotes `""" ... """`. The sequence of characters is
 arbitrary, except that it may contain three or more consecutive quote characters
 only at the very end. Characters
 must not necessarily be printable; newlines or other
-control characters are also permitted.  Unicode escapes work as everywhere else, but none
-of the escape sequences [here](#escape-sequences) are interpreted.
+control characters are also permitted. [Escape sequences](#escape-sequences) are
+not processed, except for Unicode escapes.
 
 > ```scala
 >   """the present string
@@ -569,7 +550,7 @@ implicit class StringInterpolation(s: StringContext) {
 
 ### Escape Sequences
 
-The following escape sequences are recognized in character and string literals.
+The following character escape sequences are recognized in character and string literals.
 
 | charEscapeSeq | unicode  | name            | char   |
 |---------------|----------|-----------------|--------|
@@ -582,6 +563,9 @@ The following escape sequences are recognized in character and string literals.
 | `‘\‘ ‘'‘`     | `\u0027` | single quote    |  `'`   |
 | `‘\‘ ‘\‘`     | `\u005c` | backslash       |  `\`   |
 
+In addition, Unicode escape sequences of the form `\uxxxx`, where each `x` is a hex digit, are
+recognized in character and string literals.
+
 It is a compile time error if a backslash character in a character or
 string literal does not start a valid escape sequence.
 

diff --git a/spec/13-syntax-summary.md b/spec/13-syntax-summary.md
@@ -8,13 +8,6 @@ chapter: 13
 
 The following descriptions of Scala tokens uses literal characters `‘c’` when referring to the ASCII fragment `\u0000` – `\u007F`.
 
-_Unicode escapes_ are used to represent the Unicode character with the given hexadecimal code:
-
-```ebnf
-UnicodeEscape ::=  ‘\’ ‘u’ {‘u’} hexDigit hexDigit hexDigit hexDigit
-hexDigit      ::=  ‘0’ | … | ‘9’ | ‘A’ | … | ‘F’ | ‘a’ | … | ‘f’
-```
-
 ## Lexical Syntax
 
 The lexical syntax of Scala is given by the following grammar in EBNF form:
@@ -30,8 +23,10 @@ delim            ::=  ‘`’ | ‘'’ | ‘"’ | ‘.’ | ‘;’ | ‘,’
 opchar           ::=  // printableChar not matched by (whiteSpace | upper | lower |
                       // letter | digit | paren | delim | opchar | Unicode_Sm | Unicode_So)
 printableChar    ::=  // all characters in [\u0020, \u007F] inclusive
+UnicodeEscape    ::=  ‘\’ ‘u’ {‘u’} hexDigit hexDigit hexDigit hexDigit
+hexDigit         ::=  ‘0’ | … | ‘9’ | ‘A’ | … | ‘F’ | ‘a’ | … | ‘f’
 charEscapeSeq    ::=  ‘\’ (‘b’ | ‘t’ | ‘n’ | ‘f’ | ‘r’ | ‘"’ | ‘'’ | ‘\’)
-
+escapeSeq        ::=  UnicodeEscape | charEscapeSeq
 op               ::=  opchar {opchar}
 varid            ::=  lower idrest
 boundvarid       ::=  varid
@@ -40,7 +35,7 @@ plainid          ::=  upper idrest
                    |  varid
                    |  op
 id               ::=  plainid
-                   |  ‘`’ { charNoBackQuoteOrNewline | UnicodeEscape | charEscapeSeq } ‘`’
+                   |  ‘`’ { charNoBackQuoteOrNewline | escapeSeq } ‘`’
 idrest           ::=  {letter | digit} [‘_’ op]
 
 integerLiteral   ::=  (decimalNumeral | hexNumeral) [‘L’ | ‘l’]
@@ -57,13 +52,12 @@ floatType        ::=  ‘F’ | ‘f’ | ‘D’ | ‘d’
 
 booleanLiteral   ::=  ‘true’ | ‘false’
 
-characterLiteral ::=  ‘'’ (charNoQuoteOrNewline | UnicodeEscape | charEscapeSeq) ‘'’
+characterLiteral ::=  ‘'’ (charNoQuoteOrNewline | escapeSeq) ‘'’
 
 stringLiteral    ::=  ‘"’ {stringElement} ‘"’
                    |  ‘"""’ multiLineChars ‘"""’
 stringElement    ::=  charNoDoubleQuoteOrNewline
-                   |  UnicodeEscape
-                   |  charEscapeSeq
+                   |  escapeSeq
 multiLineChars   ::=  {[‘"’] [‘"’] charNoDoubleQuote} {‘"’}
 
 interpolatedString 

diff --git a/spec/15-changelog.md b/spec/15-changelog.md
@@ -6,6 +6,12 @@ chapter: 15
 
 # Changelog
 
+Changes in Version 2.13.2
+
+#### Unicode escapes
+
+Unicode escapes are no longer pre-processed, but are processed as regular escapes.
+
 Changes in Version 2.8.0
 ------------------------
 

diff --git a/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala b/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala
@@ -237,10 +237,7 @@ trait Scanners extends ScannersCommon {
 
     /** append Unicode character to "cbuf" buffer
      */
-    protected def putChar(c: Char): Unit = {
-//      assert(cbuf.size < 10000, cbuf)
-      cbuf.append(c)
-    }
+    protected def putChar(c: Char): Unit = cbuf.append(c)
 
     /** Determines whether this scanner should emit identifier deprecation warnings,
      *  e.g. when seeing `macro` or `then`, which are planned to become keywords in future versions of Scala.
@@ -653,7 +650,7 @@ trait Scanners extends ScannersCommon {
               charLitOr(() => getIdentRest())
             else if (isOperatorPart(ch) && (ch != '\\'))
               charLitOr(() => getOperatorRest())
-            else if (!isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape)) {
+            else if (!isAtEnd && (ch != SU && ch != CR && ch != LF)) {
               val isEmptyCharLit = (ch == '\'')
               getLitChar()
               if (ch == '\'') {
@@ -843,11 +840,28 @@ trait Scanners extends ScannersCommon {
 
     private def unclosedStringLit(): Unit = syntaxError("unclosed string literal")
 
+    private def replaceUnicodeEscapesInTriple(): Unit = 
+      if(strVal != null) {
+        try {
+          val replaced = StringContext.processUnicode(strVal)
+          if(replaced != strVal) {
+            val diffPosition = replaced.zip(strVal).zipWithIndex.collectFirst{ case ((r, o), i) if r != o => i}.getOrElse(replaced.length - 1)
+            deprecationWarning(offset + 3 + diffPosition, "Unicode escapes in triple quoted strings are deprecated, use the literal character instead", since="2.13.2")
+          }
+          strVal = replaced
+        } catch {
+          case ue: StringContext.InvalidUnicodeEscapeException => {
+            syntaxError(offset + 3 + ue.index, ue.getMessage())
+          }
+        }
+      }
+
     @tailrec private def getRawStringLit(): Unit = {
       if (ch == '\"') {
         nextRawChar()
         if (isTripleQuote()) {
           setStrVal()
+          if(!currentRun.isScala214) replaceUnicodeEscapesInTriple()
           token = STRINGLIT
         } else
           getRawStringLit()
@@ -911,7 +925,7 @@ trait Scanners extends ScannersCommon {
           syntaxError(s"invalid string interpolation $$$ch, expected: $$$$, $$identifier or $${expression}")
         }
       } else {
-        val isUnclosedLiteral = !isUnicodeEscape && (ch == SU || (!multiLine && (ch == CR || ch == LF)))
+        val isUnclosedLiteral = (ch == SU || (!multiLine && (ch == CR || ch == LF)))
         if (isUnclosedLiteral) {
           if (multiLine)
             incompleteInputError("unclosed multi-line string literal")
@@ -974,31 +988,64 @@ trait Scanners extends ScannersCommon {
           syntaxError(start, s"octal escape literals are unsupported: use $alt instead")
           putChar(oct.toChar)
         } else {
-          ch match {
-            case 'b'  => putChar('\b')
-            case 't'  => putChar('\t')
-            case 'n'  => putChar('\n')
-            case 'f'  => putChar('\f')
-            case 'r'  => putChar('\r')
-            case '\"' => putChar('\"')
-            case '\'' => putChar('\'')
-            case '\\' => putChar('\\')
-            case _    => invalidEscape()
+          if (ch == 'u') {
+            if (getUEscape()) nextChar()
+          }
+          else {
+            ch match {
+              case 'b'  => putChar('\b')
+              case 't'  => putChar('\t')
+              case 'n'  => putChar('\n')
+              case 'f'  => putChar('\f')
+              case 'r'  => putChar('\r')
+              case '\"' => putChar('\"')
+              case '\'' => putChar('\'')
+              case '\\' => putChar('\\')
+              case _    => invalidEscape()
+            }
+            nextChar()
           }
-          nextChar()
         }
       } else  {
         putChar(ch)
         nextChar()
       }
 
+    private def getUEscape(): Boolean = {
+      while (ch == 'u') nextChar()
+      var codepoint = 0
+      var digitsRead = 0
+      while (digitsRead < 4) {
+        if (digitsRead > 0) nextChar()
+        val digit = digit2int(ch, 16)
+        digitsRead += 1
+        if (digit >= 0) {
+          codepoint = codepoint << 4
+          codepoint += digit
+        }
+        else {
+          invalidUnicodeEscape(digitsRead)
+          return false
+        }
+      }
+      val found = codepoint.asInstanceOf[Char]
+      putChar(found)
+      true
+    }
+
+
     protected def invalidEscape(): Unit = {
       syntaxError(charOffset - 1, "invalid escape character")
       putChar(ch)
     }
 
+    protected def invalidUnicodeEscape(n: Int): Unit = {
+      syntaxError(charOffset - n, "invalid unicode escape")
+      putChar(ch)
+    }
+
     private def getLitChars(delimiter: Char) = {
-      while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
+      while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF))
         getLitChar()
     }
 
@@ -1391,7 +1438,6 @@ trait Scanners extends ScannersCommon {
    */
   class SourceFileScanner(val source: SourceFile) extends Scanner {
     val buf = source.content
-    override val decodeUni: Boolean = !settings.nouescape
 
     // suppress warnings, throw exception on errors
     def deprecationWarning(off: Offset, msg: String, since: String): Unit = ()

diff --git a/src/compiler/scala/tools/nsc/javac/JavaScanners.scala b/src/compiler/scala/tools/nsc/javac/JavaScanners.scala
@@ -881,7 +881,7 @@ trait JavaScanners extends ast.parser.ScannersCommon {
   }
 
   class JavaUnitScanner(unit: CompilationUnit) extends JavaScanner {
-    in = new JavaCharArrayReader(new ArraySeq.ofChar(unit.source.content), !settings.nouescape.value, syntaxError)
+    in = new JavaCharArrayReader(new ArraySeq.ofChar(unit.source.content), true, syntaxError)
     init()
     def error(pos: Int, msg: String) = reporter.error(pos, msg)
     def incompleteInputError(pos: Int, msg: String) = currentRun.parsing.incompleteInputError(pos, msg)

diff --git a/src/compiler/scala/tools/nsc/settings/ScalaSettings.scala b/src/compiler/scala/tools/nsc/settings/ScalaSettings.scala
@@ -120,7 +120,6 @@ trait ScalaSettings extends StandardScalaSettings with Warnings {
   val maxerrs            = IntSetting          ("-Xmaxerrs", "Maximum errors to print", 100, None, _ => None)
   val maxwarns           = IntSetting          ("-Xmaxwarns", "Maximum warnings to print", 100, None, _ => None)
   val Xmigration         = ScalaVersionSetting ("-Xmigration", "version", "Warn about constructs whose behavior may have changed since version.", initial = NoScalaVersion, default = Some(AnyScalaVersion))
-  val nouescape          = BooleanSetting      ("-Xno-uescape", "Disable handling of \\u unicode escapes.")
   val Xnojline           = BooleanSetting      ("-Xnojline", "Do not use JLine for editing.")
   val Xverify            = BooleanSetting      ("-Xverify", "Verify generic signatures in generated bytecode.")
   val plugin             = MultiStringSetting  ("-Xplugin", "paths", "Load a plugin from each classpath.")