Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support of UTF-16 surrogate pairs to okio streams #2033

Merged
merged 3 commits into from Sep 9, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -46,12 +46,54 @@ internal class JsonToOkioStreamWriter(private val target: BufferedSink) : JsonWr
}
}

/** Largest code point that still fits into a single UTF-16 [Char]. */
private const val SINGLE_CHAR_MAX_CODEPOINT = Char.MAX_VALUE.code

/**
 * Offset added to `codePoint ushr 10` to obtain the high (leading) UTF-16 surrogate.
 * It folds the 0xD800 surrogate base and the subtraction of the 0x10000 supplementary-plane bias into one constant.
 */
private const val HIGH_SURROGATE_HEADER = 0xD800 - (0x10000 ushr 10)

/** Offset added to the low 10 bits of a code point to obtain the low (trailing) UTF-16 surrogate. */
private const val LOW_SURROGATE_HEADER = 0xDC00


internal class OkioSerialReader(private val source: BufferedSource): SerialReader {
/*
The source is decoded from UTF-8 one code point at a time, and some code points need 2 chars (a UTF-16 surrogate pair).
If the last code point read needs 2 chars but the array has room for only one, the second char is buffered for the next read.
*/
private var bufferedChar: Char? = null

/**
 * Decodes UTF-8 code points from [source] and stores them as UTF-16 chars in [buffer].
 *
 * Code points above [SINGLE_CHAR_MAX_CODEPOINT] are written as a high/low surrogate pair. When only the
 * high surrogate fits into the requested range, the low surrogate is kept in [bufferedChar] and emitted
 * first on the next call.
 *
 * @param buffer destination array
 * @param bufferOffset index in [buffer] of the first char to write
 * @param count maximum number of chars to write
 * @return the number of chars written, or -1 if nothing was written
 */
override fun read(buffer: CharArray, bufferOffset: Int, count: Int): Int {
    var i = 0

    // Flush the low surrogate left over from the previous call, but only if the caller
    // actually asked for at least one char; otherwise keep it for a later call.
    val pending = bufferedChar
    if (pending != null && count > 0) {
        buffer[bufferOffset] = pending
        i++
        bufferedChar = null
    }

    while (i < count && !source.exhausted()) {
        val codePoint = source.readUtf8CodePoint()
        if (codePoint <= SINGLE_CHAR_MAX_CODEPOINT) {
            buffer[bufferOffset + i] = codePoint.toChar()
            i++
        } else {
            // an example of working with surrogates is taken from okio library with minor changes, see https://github.com/square/okio
            // UTF-16 high surrogate: 110110xxxxxxxxxx (10 bits)
            // UTF-16 low surrogate:  110111yyyyyyyyyy (10 bits)
            // Unicode code point:    00010000000000000000 + xxxxxxxxxxyyyyyyyyyy (21 bits)
            val upChar = ((codePoint ushr 10) + HIGH_SURROGATE_HEADER).toChar()
            val lowChar = ((codePoint and 0x03ff) + LOW_SURROGATE_HEADER).toChar()

            buffer[bufferOffset + i] = upChar
            i++

            if (i < count) {
                buffer[bufferOffset + i] = lowChar
                i++
            } else {
                // if char array is full - buffer lower surrogate
                bufferedChar = lowChar
            }
        }
    }
    return if (i > 0) i else -1
}
Expand Down
@@ -0,0 +1,22 @@
/*
* Copyright 2017-2022 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license.
*/

package kotlinx.serialization.features

import kotlinx.serialization.builtins.serializer
import kotlinx.serialization.json.JsonTestBase
import kotlin.test.Test


/**
 * Checks JSON handling of a string whose single code point lies outside the Basic Multilingual Plane
 * and is therefore represented by a UTF-16 surrogate pair.
 */
class EmojiTest : JsonTestBase() {

    @Test
    fun testEmojiString() {
        // U+1F334 written as its surrogate pair (high \uD83C, low \uDF34).
        val emoji = "\uD83C\uDF34"
        // The same string wrapped in JSON quotes.
        val expectedJson = "\"\uD83C\uDF34\""

        // NOTE(review): assertJsonFormAndRestored comes from JsonTestBase; presumably it checks both the
        // encoded form and the decoded round trip - confirm against the base class.
        assertJsonFormAndRestored(String.serializer(), emoji, expectedJson)
    }
}