Skip to content

Commit

Permalink
Merge pull request #9687 from som-snytt/issue/1406
Browse files Browse the repository at this point in the history
Accept supplementary Unicode characters in source code
  • Loading branch information
SethTisue committed Aug 2, 2021
2 parents 3081265 + b124a54 commit 3f06ac7
Show file tree
Hide file tree
Showing 19 changed files with 287 additions and 107 deletions.
1 change: 1 addition & 0 deletions build.sbt
Expand Up @@ -723,6 +723,7 @@ lazy val junit = project.in(file("test") / "junit")
"-feature",
"-Xlint:-valpattern,_",
"-Wconf:msg=match may not be exhaustive:s", // if we missed a case, all that happens is the test fails
"-Wconf:cat=lint-nullary-unit&site=.*Test:s", // normal unit test style
"-Ypatmat-exhaust-depth", "40", // despite not caring about patmat exhaustiveness, we still get warnings for this
),
Compile / javacOptions ++= Seq("-Xlint"),
Expand Down
14 changes: 6 additions & 8 deletions spec/01-lexical-syntax.md
Expand Up @@ -6,13 +6,11 @@ chapter: 1

# Lexical Syntax

Scala programs are written using the Unicode Basic Multilingual Plane
(_BMP_) character set; Unicode supplementary characters are not
presently supported. This chapter defines the two modes of Scala's
lexical syntax, the Scala mode, and the _XML mode_. If not
otherwise mentioned, the following descriptions of Scala tokens refer
to _Scala mode_, and literal characters ‘c’ refer to the ASCII fragment
`\u0000` – `\u007F`.
Scala source code consists of Unicode text.

The program text is tokenized as described in this chapter.
See the last section for special support for XML literals,
which are parsed in _XML mode_.

To construct tokens, characters are distinguished according to the following
classes (Unicode general category given in parentheses):
Expand Down Expand Up @@ -74,7 +72,7 @@ or `_`, and _constant identifiers_, which do not.
For this purpose, lower case letters include not only a-z,
but also all characters in Unicode category Ll (lowercase letter),
as well as all letters that have contributory property
Other_Lowercase, except characters in category Nl (letter numerals)
Other_Lowercase, except characters in category Nl (letter numerals),
which are never taken as lower case.

The following are examples of variable identifiers:
Expand Down
4 changes: 2 additions & 2 deletions spec/06-expressions.md
Expand Up @@ -659,7 +659,7 @@ character. Characters are listed below in increasing order of
precedence, with characters on the same line having the same precedence.

```scala
(all letters)
(all letters, as defined in [chapter 1](01-lexical-syntax.html), including `_` and `$`)
|
^
&
Expand All @@ -668,7 +668,7 @@ precedence, with characters on the same line having the same precedence.
:
+ -
* / %
(all other special characters)
(other operator characters, as defined in [chapter 1](01-lexical-syntax.html), including Unicode categories `Sm` and `So`)
```

That is, operators starting with a letter have lowest precedence,
Expand Down
2 changes: 1 addition & 1 deletion src/compiler/scala/tools/nsc/ast/parser/Parsers.scala
Expand Up @@ -264,7 +264,7 @@ self =>
if (syntaxErrors.isEmpty) firstTry
else in.healBraces() match {
case Nil => showSyntaxErrors() ; firstTry
case patches => (this withPatches patches).parse()
case patches => withPatches(patches).parse()
}
}
}
Expand Down
199 changes: 132 additions & 67 deletions src/compiler/scala/tools/nsc/ast/parser/Scanners.scala
Expand Up @@ -172,7 +172,45 @@ trait Scanners extends ScannersCommon {
/** A switch whether operators at the start of lines can be infix operators. */
private var allowLeadingInfixOperators = true

private def isDigit(c: Char) = java.lang.Character isDigit c
private def isDigit(c: Char) = Character.isDigit(c)

import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierPart, isUnicodeIdentifierStart, isValidCodePoint, toCodePoint}

/** Tries to consume a supplementary character (a UTF-16 surrogate pair) whose first
 *  half is `high`, which callers pass as the current char.
 *
 *  If `high` is not a high surrogate, returns false without touching the input.
 *  Otherwise the scanner is advanced with `nextChar()` and the new current char is
 *  taken as the candidate low surrogate:
 *   - low surrogate present, code point valid and accepted by `test`: both halves are
 *     appended with `putChar` and the result is true;
 *   - low surrogate present but the code point is rejected: a syntax error is
 *     reported and the result is false;
 *   - no low surrogate and `strict` is false: the lone high surrogate is appended
 *     with `putChar` and the result is true;
 *   - no low surrogate and `strict` is true: a syntax error is reported and the
 *     result is false.
 *
 *  NOTE(review): once `high` is recognised as a high surrogate the input has already
 *  been advanced (by one char, or by two when a low surrogate followed), even when
 *  the result is false; callers that fall through to another branch see the later char.
 *
 *  @param high   the char to examine as the leading surrogate
 *  @param test   predicate applied to the combined code point
 *  @param strict when true (the default), a missing low surrogate is a syntax error
 *  @return true if one or both surrogate chars were put to the buffer
 */
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
isHighSurrogate(high) && {
var res = false
// step past the high surrogate; ch is now the candidate low surrogate
nextChar()
val low = ch
if (isLowSurrogate(low)) {
// step past the low surrogate before the code point is validated
nextChar()
val codepoint = toCodePoint(high, low)
if (isValidCodePoint(codepoint) && test(codepoint)) {
putChar(high)
putChar(low)
res = true
} else
syntaxError(f"illegal character '\\u$high%04x\\u$low%04x'")
} else if (!strict) {
// lenient mode: keep the orphan high surrogate; the char that followed it stays current
putChar(high)
res = true
} else
syntaxError(f"illegal character '\\u$high%04x' missing low surrogate")
res
}
/** Non-consuming check: is `ch` a high surrogate that is immediately followed, in the
 *  raw input, by a low surrogate such that the combined code point is valid and
 *  satisfies `f`? Only a lookahead reader is advanced; the scanner itself is not.
 */
private def atSupplementary(ch: Char, f: Int => Boolean): Boolean =
  if (!isHighSurrogate(ch)) false
  else {
    // peek at the next raw char without moving the real reader
    val peek = lookaheadReader
    peek.nextRawChar()
    val trailing = peek.ch
    if (!isLowSurrogate(trailing)) false
    else {
      val combined = toCodePoint(ch, trailing)
      isValidCodePoint(combined) && f(combined)
    }
  }

private var openComments = 0
final protected def putCommentChar(): Unit = { processCommentChar(); nextChar() }
Expand Down Expand Up @@ -705,14 +743,18 @@ trait Scanners extends ScannersCommon {
syntaxError("empty character literal (use '\\'' for single quote)")
else {
nextChar()
token = CHARLIT
setStrVal()
if (cbuf.length != 1)
syntaxError("illegal codepoint in Char constant: " + cbuf.toString.map(c => f"\\u$c%04x").mkString("'", "", "'"))
else {
token = CHARLIT
setStrVal()
}
}
} else if (isEmptyCharLit) {
}
else if (isEmptyCharLit)
syntaxError("empty character literal")
} else {
else
unclosedCharLit()
}
}
else unclosedCharLit()
}
Expand Down Expand Up @@ -755,16 +797,18 @@ trait Scanners extends ScannersCommon {
} else if (ch == '\u2190') {
deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", "2.13.0")
nextChar(); token = LARROW
} else if (Character.isUnicodeIdentifierStart(ch)) {
} else if (isUnicodeIdentifierStart(ch)) {
putChar(ch)
nextChar()
getIdentRest()
} else if (isSpecial(ch)) {
putChar(ch)
nextChar()
getOperatorRest()
} else if (isSupplementary(ch, isUnicodeIdentifierStart)) {
getIdentRest()
} else {
syntaxError("illegal character '" + ("" + '\\' + 'u' + "%04x".format(ch.toInt)) + "'")
syntaxError(f"illegal character '\\u$ch%04x'")
nextChar()
}
}
Expand Down Expand Up @@ -831,13 +875,15 @@ trait Scanners extends ScannersCommon {
case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
finishNamed()
case _ =>
if (Character.isUnicodeIdentifierPart(ch)) {
if (isUnicodeIdentifierPart(ch)) {
putChar(ch)
nextChar()
getIdentRest()
} else {
finishNamed()
}
else if (isSupplementary(ch, isUnicodeIdentifierPart))
getIdentRest()
else
finishNamed()
}

@tailrec
Expand Down Expand Up @@ -955,6 +1001,25 @@ trait Scanners extends ScannersCommon {
}
getStringPart(multiLine, seenEscapedQuote || q)
} else if (ch == '$') {
// Local helper: consumes the remainder of a `$identifier` in an interpolated string,
// appending chars with putChar and advancing with nextRawChar, then publishes the
// buffered text as the `next` token.
@tailrec def getInterpolatedIdentRest(): Unit =
if (ch != SU && isUnicodeIdentifierPart(ch)) {
// ordinary identifier-part char; SU is excluded explicitly so it never extends the identifier
putChar(ch)
nextRawChar()
getInterpolatedIdentRest()
} else if (atSupplementary(ch, isUnicodeIdentifierPart)) {
// ch plus the following raw char form a surrogate pair that is an identifier part:
// take the high half, then the low half
putChar(ch)
nextRawChar()
putChar(ch)
nextRawChar()
getInterpolatedIdentRest()
} else {
// identifier ended: hand the buffered chars over as the name of the next token
next.token = IDENTIFIER
next.name = newTermName(cbuf.toCharArray)
cbuf.clear()
// presumably remaps reserved words to their keyword token via kwArray — confirm against its definition
val idx = next.name.start - kwOffset
if (idx >= 0 && idx < kwArray.length)
next.token = kwArray(idx)
}
nextRawChar()
if (ch == '$' || ch == '"') {
putChar(ch)
Expand All @@ -968,32 +1033,29 @@ trait Scanners extends ScannersCommon {
finishStringPart()
nextRawChar()
next.token = USCORE
} else if (Character.isUnicodeIdentifierStart(ch)) {
} else if (isUnicodeIdentifierStart(ch)) {
finishStringPart()
do {
putChar(ch)
nextRawChar()
} while (ch != SU && Character.isUnicodeIdentifierPart(ch))
next.token = IDENTIFIER
next.name = newTermName(cbuf.toString)
cbuf.clear()
val idx = next.name.start - kwOffset
if (idx >= 0 && idx < kwArray.length) {
next.token = kwArray(idx)
}
putChar(ch)
nextRawChar()
getInterpolatedIdentRest()
} else if (atSupplementary(ch, isUnicodeIdentifierStart)) {
finishStringPart()
putChar(ch)
nextRawChar()
putChar(ch)
nextRawChar()
getInterpolatedIdentRest()
} else {
val expectations = "$$, $\", $identifier or ${expression}"
syntaxError(s"invalid string interpolation $$$ch, expected: $expectations")
}
} else {
val isUnclosedLiteral = (ch == SU || (!multiLine && (ch == CR || ch == LF)))
if (isUnclosedLiteral) {
if (isUnclosedLiteral)
if (multiLine)
incompleteInputError("unclosed multi-line string literal")
else {
else
unclosedStringLit(seenEscapedQuote)
}
}
else {
putChar(ch)
nextRawChar()
Expand Down Expand Up @@ -1027,53 +1089,38 @@ trait Scanners extends ScannersCommon {
false
}

/** copy current character into cbuf, interpreting any escape sequences,
* and advance to next character.
/** Copy current character into cbuf, interpreting any escape sequences,
* and advance to next character. Surrogate pairs are consumed (see check
* at fetchSingleQuote), but orphan surrogate is allowed.
*/
protected def getLitChar(): Unit =
if (ch == '\\') {
nextChar()
if ('0' <= ch && ch <= '7') {
val start = charOffset - 2
val leadch: Char = ch
var oct: Int = digit2int(ch, 8)
nextChar()
if ('0' <= ch && ch <= '7') {
oct = oct * 8 + digit2int(ch, 8)
nextChar()
if (leadch <= '3' && '0' <= ch && ch <= '7') {
oct = oct * 8 + digit2int(ch, 8)
nextChar()
}
}
val alt = if (oct == LF) "\\n" else "\\u%04x" format oct
syntaxError(start, s"octal escape literals are unsupported: use $alt instead")
putChar(oct.toChar)
} else {
if (ch == 'u') {
if (getUEscape()) nextChar()
}
else {
ch match {
case 'b' => putChar('\b')
case 't' => putChar('\t')
case 'n' => putChar('\n')
case 'f' => putChar('\f')
case 'r' => putChar('\r')
case '\"' => putChar('\"')
case '\'' => putChar('\'')
case '\\' => putChar('\\')
case _ => invalidEscape()
}
nextChar()
}
}
} else {
charEscape()
} else if (!isSupplementary(ch, _ => true, strict = false)) {
putChar(ch)
nextChar()
}

private def getUEscape(): Boolean = {
/** Handles the character following a backslash (the current char) in a literal.
 *  Single-character escapes append their replacement; `u` and octal digits are
 *  delegated to `uEscape` and `octalEscape`, whose result says whether the current
 *  char still has to be skipped; anything else is reported via `invalidEscape`.
 */
private def charEscape(): Unit = {
  // append the replacement for a one-char escape and request the usual advance
  def emit(replacement: Char): Boolean = { putChar(replacement); true }
  val advance: Boolean = ch match {
    case 'b'  => emit('\b')
    case 't'  => emit('\t')
    case 'n'  => emit('\n')
    case 'f'  => emit('\f')
    case 'r'  => emit('\r')
    case '\"' => emit('\"')
    case '\'' => emit('\'')
    case '\\' => emit('\\')
    case 'u'  => uEscape()
    case d if d >= '0' && d <= '7' => octalEscape()
    case _ =>
      invalidEscape()
      true
  }
  if (advance) nextChar()
}

private def uEscape(): Boolean = {
while (ch == 'u') nextChar()
var codepoint = 0
var digitsRead = 0
Expand All @@ -1094,7 +1141,25 @@ trait Scanners extends ScannersCommon {
putChar(found)
true
}


/** Reads an octal escape of one to three digits starting at the current char, reports
 *  it as unsupported (suggesting an equivalent escape), and still appends the decoded
 *  char. Always returns false: the digits have already been stepped over here, so
 *  the caller must not advance again.
 */
private def octalEscape(): Boolean = {
  def isOctalDigit(c: Char): Boolean = '0' <= c && c <= '7'
  // position used for the error report, computed before any digit is consumed
  val errorOffset = charOffset - 2
  val firstDigit: Char = ch
  var value: Int = digit2int(firstDigit, 8)
  nextChar()
  if (isOctalDigit(ch)) {
    value = value * 8 + digit2int(ch, 8)
    nextChar()
    // a third digit is only taken when the first is at most 3, keeping the value within 0377
    if (firstDigit <= '3' && isOctalDigit(ch)) {
      value = value * 8 + digit2int(ch, 8)
      nextChar()
    }
  }
  val suggestion = if (value == LF) "\\n" else f"\\u$value%04x"
  syntaxError(errorOffset, s"octal escape literals are unsupported: use $suggestion instead")
  putChar(value.toChar)
  false
}

protected def invalidEscape(): Unit = {
syntaxError(charOffset - 1, "invalid escape character")
Expand Down
Expand Up @@ -27,9 +27,7 @@ import scala.tools.nsc.io.AbstractFile
*/
final class AbstractFileReader(val buf: Array[Byte]) extends DataReader {
@deprecated("Use other constructor", "2.13.0")
def this(file: AbstractFile) = {
this(file.toByteArray)
}
def this(file: AbstractFile) = this(file.toByteArray)

/** the current input pointer
*/
Expand Down Expand Up @@ -67,9 +65,8 @@ final class AbstractFileReader(val buf: Array[Byte]) extends DataReader {
def getByte(mybp: Int): Byte =
buf(mybp)

def getBytes(mybp: Int, bytes: Array[Byte]): Unit = {
def getBytes(mybp: Int, bytes: Array[Byte]): Unit =
System.arraycopy(buf, mybp, bytes, 0, bytes.length)
}

/** extract a character at position bp from buf
*/
Expand All @@ -95,9 +92,8 @@ final class AbstractFileReader(val buf: Array[Byte]) extends DataReader {
*/
def getDouble(mybp: Int): Double = longBitsToDouble(getLong(mybp))

def getUTF(mybp: Int, len: Int): String = {
def getUTF(mybp: Int, len: Int): String =
new DataInputStream(new ByteArrayInputStream(buf, mybp, len)).readUTF
}

/** skip next 'n' bytes
*/
Expand Down
1 change: 1 addition & 0 deletions src/partest/scala/tools/partest/DirectTest.scala
Expand Up @@ -45,6 +45,7 @@ abstract class DirectTest {
protected def pathOf(locations: String*) = locations.mkString(sys.props("path.separator"))

// override to add additional settings besides -d testOutput.path
// default is -usejavacp
def extraSettings: String = "-usejavacp"
// a default Settings object using only extraSettings
def settings: Settings = newSettings(CommandLineParser.tokenize(extraSettings))
Expand Down

0 comments on commit 3f06ac7

Please sign in to comment.