Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve support for Unicode supplementary characters in identifiers and string interpolation #9805

Merged
merged 2 commits into from Mar 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 4 additions & 4 deletions spec/01-lexical-syntax.md
Expand Up @@ -506,7 +506,7 @@ interpolatedString ::= alphaid ‘"’ {[‘\’] interpolatedStringPart |
interpolatedStringPart ::= printableChar \ (‘"’ | ‘$’ | ‘\’) | escape
escape ::= ‘$$’
| ‘$"’
| ‘$’ id
| ‘$’ alphaid
| ‘$’ BlockExpr
alphaid ::= upper idrest
| varid
Expand All @@ -533,9 +533,9 @@ in an interpolated string. A single ‘$’-sign can still be obtained by doubli
character: ‘$$’. A single ‘"’-sign can be obtained by the sequence ‘\$"’.

The simpler form consists of a ‘$’-sign followed by an identifier starting with
a letter and followed only by letters, digits, and underscore characters,
e.g., `$id`. The simpler form is expanded by putting braces around the identifier,
e.g., `$id` is equivalent to `${id}`. In the following, unless we explicitly state otherwise,
a letter and followed only by letters, digits, and underscore characters, e.g., `$id`.
The simpler form is expanded by putting braces around the identifier,
e.g., `$id` is equivalent to `${id}`. In the following, unless we explicitly state otherwise,
we assume that this expansion has already been performed.

The expanded expression is type checked normally. Usually, `StringContext` will resolve to
Expand Down
4 changes: 2 additions & 2 deletions spec/13-syntax-summary.md
Expand Up @@ -15,7 +15,7 @@ The lexical syntax of Scala is given by the following grammar in EBNF form:
```ebnf
whiteSpace ::= ‘\u0020’ | ‘\u0009’ | ‘\u000D’ | ‘\u000A’
upper ::= ‘A’ | … | ‘Z’ | ‘$’ and any character in Unicode categories Lu, Lt or Nl,
and any character in Unicode categories Lo and Lm that don't have
and any character in Unicode categories Lo and Lm that doesn't have
contributory property Other_Lowercase
lower ::= ‘a’ | … | ‘z’ | ‘_’ and any character in Unicode category Ll,
and any character in Unicode categories Lo or Lm that has contributory
Expand Down Expand Up @@ -72,7 +72,7 @@ interpolatedStringPart
::= printableChar \ (‘"’ | ‘$’ | ‘\’) | escape
escape ::= ‘\$\$’
| ‘\$"’
| ‘\$’ id
| ‘\$’ alphaid
| ‘\$’ BlockExpr
alphaid ::= upper idrest
| varid
Expand Down
107 changes: 51 additions & 56 deletions src/compiler/scala/tools/nsc/ast/parser/Scanners.scala
Expand Up @@ -182,22 +182,26 @@ trait Scanners extends ScannersCommon {
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
isHighSurrogate(high) && {
var res = false
nextChar()
val low = ch
val low = lookaheadReader.getc()
if (isLowSurrogate(low)) {
nextChar()
val codepoint = toCodePoint(high, low)
if (isValidCodePoint(codepoint) && test(codepoint)) {
putChar(high)
putChar(low)
res = true
} else
syntaxError(f"illegal character '\\u$high%04x\\u$low%04x'")
} else if (!strict) {
val codePoint = toCodePoint(high, low)
if (isValidCodePoint(codePoint)) {
if (test(codePoint)) {
putChar(high)
putChar(low)
nextChar()
nextChar()
res = true
}
}
else syntaxError(f"illegal character '\\u$high%04x\\u$low%04x'")
}
else if (!strict) {
putChar(high)
nextChar()
res = true
} else
syntaxError(f"illegal character '\\u$high%04x' missing low surrogate")
}
else syntaxError(f"illegal character '\\u$high%04x' missing low surrogate")
res
}
private def atSupplementary(ch: Char, f: Int => Boolean): Boolean =
Expand Down Expand Up @@ -621,8 +625,7 @@ trait Scanners extends ScannersCommon {
putChar(ch)
nextChar()
getIdentRest()
if (ch == '"' && token == IDENTIFIER)
token = INTERPOLATIONID
if (ch == '"' && token == IDENTIFIER) token = INTERPOLATIONID
case '<' => // is XMLSTART?
def fetchLT() = {
val last = if (charOffset >= 2) buf(charOffset - 2) else ' '
Expand Down Expand Up @@ -729,12 +732,31 @@ trait Scanners extends ScannersCommon {
}
syntaxError(msg)
}
/** Decides between a character literal and a (deprecated) Symbol literal.
 *
 *  If the current character is a closing single quote, it is consumed and the
 *  token becomes `CHARLIT`. Otherwise `op` is run to scan the rest of the name
 *  and the token becomes `SYMBOLLIT`, with the scanned name as its string value.
 *
 *  @param op scans the remainder of the identifier or operator
 */
def charLitOrSymbolAfter(op: () => Unit): Unit =
  if (ch != '\'') {
    // no closing quote here: treat what follows as a symbol name
    op()
    token = SYMBOLLIT
    strVal = name.toString
  } else {
    // closing quote: step past it and finish the character literal
    nextChar()
    token = CHARLIT
    setStrVal()
  }
def fetchSingleQuote() = {
nextChar()
if (isIdentifierStart(ch))
charLitOr(() => getIdentRest())
else if (isOperatorPart(ch) && (ch != '\\'))
charLitOr(() => getOperatorRest())
if (isIdentifierStart(ch)) {
putChar(ch)
nextChar()
charLitOrSymbolAfter(() => getIdentRest())
}
else if (isOperatorPart(ch) && (ch != '\\')) {
putChar(ch)
nextChar()
charLitOrSymbolAfter(() => getOperatorRest())
}
else if (!isAtEnd && (ch != SU && ch != CR && ch != LF)) {
val isEmptyCharLit = (ch == '\'')
getLitChar()
Expand Down Expand Up @@ -801,12 +823,16 @@ trait Scanners extends ScannersCommon {
putChar(ch)
nextChar()
getIdentRest()
if (ch == '"' && token == IDENTIFIER) token = INTERPOLATIONID
} else if (isSpecial(ch)) {
putChar(ch)
nextChar()
getOperatorRest()
} else if (isSupplementary(ch, isUnicodeIdentifierStart)) {
getIdentRest()
if (ch == '"' && token == IDENTIFIER) token = INTERPOLATIONID
} else if (isSupplementary(ch, isSpecial)) {
getOperatorRest()
} else {
syntaxError(f"illegal character '\\u$ch%04x'")
nextChar()
Expand Down Expand Up @@ -872,7 +898,8 @@ trait Scanners extends ScannersCommon {
putChar(ch)
nextChar()
getIdentOrOperatorRest()
case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
case ' ' | LF | // optimize for common whitespace
SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
finishNamed()
case _ =>
if (isUnicodeIdentifierPart(ch)) {
Expand All @@ -888,6 +915,7 @@ trait Scanners extends ScannersCommon {

@tailrec
private def getOperatorRest(): Unit = (ch: @switch) match {
case ' ' | LF => finishNamed() // optimize
case '~' | '!' | '@' | '#' | '%' |
'^' | '*' | '+' | '-' | '<' |
'>' | '?' | ':' | '=' | '&' |
Expand All @@ -899,24 +927,12 @@ trait Scanners extends ScannersCommon {
else { putChar('/'); getOperatorRest() }
case _ =>
if (isSpecial(ch)) { putChar(ch); nextChar(); getOperatorRest() }
else if (isSupplementary(ch, isSpecial)) getOperatorRest()
som-snytt marked this conversation as resolved.
Show resolved Hide resolved
else finishNamed()
}

/** Continues scanning a name based on the current character:
 *  an identifier part continues as an identifier, an operator character
 *  (ASCII operator or `isSpecial`) continues as an operator, and anything
 *  else ends the name.
 */
private def getIdentOrOperatorRest(): Unit =
  ch match {
    // the identifier test is tried first, before any operator character
    case c if isIdentifierPart(c) =>
      getIdentRest()
    case '~' | '!' | '@' | '#' | '%' |
         '^' | '*' | '+' | '-' | '<' |
         '>' | '?' | ':' | '=' | '&' |
         '|' | '\\' | '/' =>
      getOperatorRest()
    case c if isSpecial(c) =>
      getOperatorRest()
    case _ =>
      finishNamed()
  }

/** Continues scanning a name: as an identifier when the current character
 *  (or the supplementary character it starts) is an identifier part,
 *  otherwise as an operator.
 */
private def getIdentOrOperatorRest(): Unit = {
  // `||` short-circuits, so the supplementary check only runs when the plain Char test fails
  val continuesIdentifier = isIdentifierPart(ch) || isSupplementary(ch, isIdentifierPart)
  if (continuesIdentifier) getIdentRest()
  else getOperatorRest()
}

// Literals -----------------------------------------------------------------

Expand Down Expand Up @@ -1040,10 +1056,6 @@ trait Scanners extends ScannersCommon {
getInterpolatedIdentRest()
} else if (atSupplementary(ch, isUnicodeIdentifierStart)) {
finishStringPart()
putChar(ch)
nextRawChar()
putChar(ch)
nextRawChar()
getInterpolatedIdentRest()
} else {
val expectations = "$$, $\", $identifier or ${expression}"
Expand Down Expand Up @@ -1370,23 +1382,6 @@ trait Scanners extends ScannersCommon {
if (detectedFloat) restOfNonIntegralNumber() else restOfNumber()
}

/** Buffers the current character, advances, and then decides the token kind.
 *
 *  If the next character is a closing single quote, it is consumed and the
 *  token becomes `CHARLIT`. Otherwise `op` is run and the token becomes
 *  `SYMBOLLIT`, with the scanned name as its string value.
 *
 *  @param op scans the remainder of the identifier or operator
 */
def charLitOr(op: () => Unit): Unit = {
  putChar(ch)
  nextChar()
  if (ch != '\'') {
    // no closing quote: the rest is a symbol name
    op()
    token = SYMBOLLIT
    strVal = name.toString
  } else {
    // closing quote: finish the character literal
    nextChar()
    token = CHARLIT
    setStrVal()
  }
}

// Errors -----------------------------------------------------------------

/** generate an error at the given offset */
Expand Down
46 changes: 33 additions & 13 deletions src/reflect/scala/reflect/internal/Chars.scala
Expand Up @@ -15,10 +15,10 @@ package reflect
package internal

import scala.annotation.switch
import java.lang.{ Character => JCharacter }

/** Contains constants and classifier methods for characters */
trait Chars {
import Chars.CodePoint
// Be very careful touching these.
// Apparently trivial changes to the way you write these constants
// will cause Scanners.scala to go from a nice efficient switch to
Expand Down Expand Up @@ -72,28 +72,46 @@ trait Chars {
'0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'

/** Can character start an alphanumeric Scala identifier? */
def isIdentifierStart(c: Char): Boolean =
(c == '_') || (c == '$') || Character.isUnicodeIdentifierStart(c)
def isIdentifierStart(c: Char): Boolean = (c == '_') || (c == '$') || Character.isUnicodeIdentifierStart(c)
def isIdentifierStart(c: CodePoint): Boolean = (c == '_') || (c == '$') || Character.isUnicodeIdentifierStart(c)

/** Can character form part of an alphanumeric Scala identifier? */
def isIdentifierPart(c: Char) =
(c == '$') || Character.isUnicodeIdentifierPart(c)
def isIdentifierPart(c: Char) = (c == '$') || Character.isUnicodeIdentifierPart(c)

def isIdentifierPart(c: CodePoint) = (c == '$') || Character.isUnicodeIdentifierPart(c)

/** Tests whether `c` has Unicode general category "math symbol" or "other symbol". */
def isSpecial(c: Char) =
  Character.getType(c) match {
    case Character.MATH_SYMBOL | Character.OTHER_SYMBOL => true
    case _ => false
  }

private final val otherLetters = Set[Char]('\u0024', '\u005F') // '$' and '_'
private final val letterGroups = {
import JCharacter._
Set[Byte](LOWERCASE_LETTER, UPPERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, LETTER_NUMBER)
def isSpecial(codePoint: CodePoint) = {
val chtp = Character.getType(codePoint)
chtp == Character.MATH_SYMBOL.toInt || chtp == Character.OTHER_SYMBOL.toInt
}
def isScalaLetter(ch: Char) = letterGroups(JCharacter.getType(ch).toByte) || otherLetters(ch)

// used for precedence
import Character.{LOWERCASE_LETTER, UPPERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, LETTER_NUMBER}
/** Tests whether `c` counts as a letter: `$`, `_`, or any character in
 *  a Unicode letter category (Ll, Lu, Lo, Lt) or the letter-number category (Nl).
 */
def isScalaLetter(c: Char): Boolean =
  c == '$' || c == '_' || (Character.getType(c) match {
    case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
    case _ => false
  })
/** Code-point variant: tests whether `c` is `$`, `_`, or a code point in
 *  a Unicode letter category (Ll, Lu, Lo, Lt) or the letter-number category (Nl).
 */
def isScalaLetter(c: CodePoint): Boolean =
  c == '$' || c == '_' || (Character.getType(c) match {
    case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
    case _ => false
  })

/** Can character form part of a Scala operator name? */
def isOperatorPart(c : Char) : Boolean = (c: @switch) match {
def isOperatorPart(c: Char): Boolean = (c: @switch) match {
case '~' | '!' | '@' | '#' | '%' |
'^' | '*' | '+' | '-' | '<' |
'>' | '?' | ':' | '=' | '&' |
'|' | '/' | '\\' => true
case c => isSpecial(c)
}
def isOperatorPart(c: CodePoint): Boolean = (c: @switch) match {
case '~' | '!' | '@' | '#' | '%' |
'^' | '*' | '+' | '-' | '<' |
'>' | '?' | ':' | '=' | '&' |
Expand All @@ -102,4 +120,6 @@ trait Chars {
}
}

object Chars extends Chars { }
object Chars extends Chars {
type CodePoint = Int
}
17 changes: 7 additions & 10 deletions src/reflect/scala/reflect/internal/Precedence.scala
Expand Up @@ -10,26 +10,23 @@
* additional information regarding copyright ownership.
*/

package scala
package reflect
package internal
package scala.reflect.internal

import scala.annotation.switch
import Chars._
import Chars.{CodePoint, isOperatorPart, isScalaLetter}

final class Precedence private (val level: Int) extends AnyVal with Ordered[Precedence] {
def compare(that: Precedence): Int = level compare that.level
def compare(that: Precedence): Int = level.compare(that.level)
override def toString = s"Precedence($level)"
}


object Precedence extends (Int => Precedence) {
private[this] val ErrorName = "<error>"
private def isAssignmentOp(name: String) = name match {
case "!=" | "<=" | ">=" | "" => false
case _ => name.last == '=' && name.head != '=' && isOperatorPart(name.head)
case _ => name.last == '=' && name.head != '=' && isOperatorPart(name.codePointAt(0))
}
private def firstChar(ch: Char): Precedence = apply((ch: @switch) match {
private def firstChar(c: CodePoint): Precedence = apply((c: @switch) match {
case '|' => 2
case '^' => 3
case '&' => 4
Expand All @@ -38,13 +35,13 @@ object Precedence extends (Int => Precedence) {
case ':' => 7
case '+' | '-' => 8
case '*' | '/' | '%' => 9
case _ => if (isScalaLetter(ch)) 1 else 10
case _ => if (isScalaLetter(c)) 1 else 10
})

def apply(level: Int): Precedence = new Precedence(level)
def apply(name: String): Precedence = name match {
case "" | ErrorName => this(-1)
case _ if isAssignmentOp(name) => this(0)
case _ => firstChar(name charAt 0)
case _ => firstChar(name.codePointAt(0))
}
}