Skip to content

Commit

Permalink
Added support of UTF-16 surrogate pairs to okio streams
Browse files Browse the repository at this point in the history
Fixes #2030
  • Loading branch information
shanshin committed Sep 9, 2022
1 parent 2fe2efa commit 79de734
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 3 deletions.
Expand Up @@ -46,12 +46,54 @@ internal class JsonToOkioStreamWriter(private val target: BufferedSink) : JsonWr
}
}

// Largest code point that still fits into a single UTF-16 Char (0xFFFF)
private const val SINGLE_CHAR_MAX_CODEPOINT = Char.MAX_VALUE.code
// Offset added to (codePoint ushr 10) to obtain the high surrogate:
// the 0xD800 surrogate base minus the shifted 0x10000 supplementary-plane offset
private const val HIGH_SURROGATE_HEADER = 0xD800 - (0x10000 ushr 10)
// Offset added to the low 10 bits of the code point to obtain the low surrogate
private const val LOW_SURROGATE_HEADER = 0xDC00


internal class OkioSerialReader(private val source: BufferedSource): SerialReader {
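/*
Code points are decoded from UTF-8 one at a time; a code point above Char.MAX_VALUE needs two chars (a UTF-16 surrogate pair).
If such a code point is decoded when only one free slot is left in the target array, the second (low surrogate) char
is kept here and written first on the next read.
*/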
private var bufferedChar: Char? = null

/**
 * Decodes UTF-8 code points from [source] and stores them in [buffer] as UTF-16 chars.
 *
 * Writing starts at [bufferOffset]; decoding stops once [count] chars have been written or the source is exhausted.
 * A code point above [SINGLE_CHAR_MAX_CODEPOINT] is written as a high/low surrogate pair. When only the high
 * surrogate fits, the low surrogate is stored in [bufferedChar] and emitted first by the next call.
 *
 * @param buffer destination array
 * @param bufferOffset index in [buffer] of the first char to write
 * @param count maximum number of chars to decode in this call
 * @return the number of chars written, or -1 if no char was written
 */
override fun read(buffer: CharArray, bufferOffset: Int, count: Int): Int {
    var i = 0

    // Emit the low surrogate left over from the previous call before decoding anything new.
    // NOTE(review): this slot is written even when count is 0 - confirm callers never pass 0.
    val pending = bufferedChar
    if (pending != null) {
        buffer[bufferOffset + i] = pending
        i++
        bufferedChar = null
    }

    while (i < count && !source.exhausted()) {
        val codePoint = source.readUtf8CodePoint()
        if (codePoint <= SINGLE_CHAR_MAX_CODEPOINT) {
            // The code point fits into one Char, no surrogates needed
            buffer[bufferOffset + i] = codePoint.toChar()
            i++
        } else {
            // an example of working with surrogates is taken from okio library with minor changes, see https://github.com/square/okio
            // UTF-16 high surrogate: 110110xxxxxxxxxx (10 bits)
            // UTF-16 low surrogate:  110111yyyyyyyyyy (10 bits)
            // Unicode code point:    00010000000000000000 + xxxxxxxxxxyyyyyyyyyy (21 bits)
            val upChar = ((codePoint ushr 10) + HIGH_SURROGATE_HEADER).toChar()
            val lowChar = ((codePoint and 0x03ff) + LOW_SURROGATE_HEADER).toChar()

            // i < count is guaranteed by the loop condition, so the high surrogate always fits
            buffer[bufferOffset + i] = upChar
            i++

            if (i < count) {
                buffer[bufferOffset + i] = lowChar
                i++
            } else {
                // if char array is full - buffer lower surrogate
                bufferedChar = lowChar
            }
        }
    }
    return if (i > 0) i else -1
}
Expand Down
@@ -0,0 +1,22 @@
/*
* Copyright 2017-2022 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license.
*/

package kotlinx.serialization.features

import kotlinx.serialization.builtins.serializer
import kotlinx.serialization.json.JsonTestBase
import kotlin.test.Test


/**
 * Covers strings containing a character that needs a UTF-16 surrogate pair.
 */
class EmojiTest : JsonTestBase() {

    /**
     * Passes a single code point U+1F334 (surrogate pair D83C DF34) and its quoted JSON form
     * to [assertJsonFormAndRestored] together with the String serializer.
     */
    @Test
    fun testEmojiString() {
        val emoji = "\uD83C\uDF34"
        val expectedJson = "\"\uD83C\uDF34\""
        val serializer = String.serializer()

        assertJsonFormAndRestored(serializer, emoji, expectedJson)
    }
}
Expand Up @@ -11,7 +11,7 @@ actual fun <T> Json.encodeViaStream(
): String {
val output = ByteArrayOutputStream()
encodeToStream(serializer, value, output)
return output.toString()
return output.toString(Charsets.UTF_8.name())
}

actual fun <T> Json.decodeViaStream(
Expand Down

0 comments on commit 79de734

Please sign in to comment.