From 9d252e68e69aa5558c2fadcd1851b02b5e482458 Mon Sep 17 00:00:00 2001 From: "Sergey.Shanshin" Date: Thu, 8 Sep 2022 12:25:13 +0200 Subject: [PATCH 1/3] Added support of UTF-16 surrogate pairs to okio streams Fixes #2030 --- .../json/okio/internal/OkioJsonStreams.kt | 46 ++++++++++++++++++- .../serialization/features/EmojiTest.kt | 22 +++++++++ 2 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 formats/json-tests/commonTest/src/kotlinx/serialization/features/EmojiTest.kt diff --git a/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt b/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt index ae8de4719..6d3c6c6dd 100644 --- a/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt +++ b/formats/json-okio/commonMain/src/kotlinx/serialization/json/okio/internal/OkioJsonStreams.kt @@ -46,12 +46,54 @@ internal class JsonToOkioStreamWriter(private val target: BufferedSink) : JsonWr } } +// Max value for a code point placed in one Char +private const val SINGLE_CHAR_MAX_CODEPOINT = Char.MAX_VALUE.code +// Value added to the high UTF-16 surrogate after shifting +private const val HIGH_SURROGATE_HEADER = 0xd800 - (0x010000 ushr 10) +// Value added to the low UTF-16 surrogate after masking +private const val LOW_SURROGATE_HEADER = 0xdc00 + + internal class OkioSerialReader(private val source: BufferedSource): SerialReader { + /* + A sequence of code points is read from UTF-8, some of it can take 2 characters. + In case the last code point requires 2 characters, and the array is already full, we buffer the second character + */ + private var bufferedChar: Char? = null + override fun read(buffer: CharArray, bufferOffset: Int, count: Int): Int { var i = 0 - while (i < count && !source.exhausted()) { - buffer[bufferOffset + i] = source.readUtf8CodePoint().toChar() + + if (bufferedChar != null) { + buffer[bufferOffset + i] = bufferedChar!! 
i++ + bufferedChar = null + } + + while (i < count && !source.exhausted()) { + val codePoint = source.readUtf8CodePoint() + if (codePoint <= SINGLE_CHAR_MAX_CODEPOINT) { + buffer[bufferOffset + i] = codePoint.toChar() + i++ + } else { + // an example of working with surrogates is taken from okio library with minor changes, see https://github.com/square/okio + // UTF-16 high surrogate: 110110xxxxxxxxxx (10 bits) + // UTF-16 low surrogate: 110111yyyyyyyyyy (10 bits) + // Unicode code point: 00010000000000000000 + xxxxxxxxxxyyyyyyyyyy (21 bits) + val upChar = ((codePoint ushr 10) + HIGH_SURROGATE_HEADER).toChar() + val lowChar = ((codePoint and 0x03ff) + LOW_SURROGATE_HEADER).toChar() + + buffer[bufferOffset + i] = upChar + i++ + + if (i < count) { + buffer[bufferOffset + i] = lowChar + i++ + } else { + // if char array is full - buffer lower surrogate + bufferedChar = lowChar + } + } } return if (i > 0) i else -1 } diff --git a/formats/json-tests/commonTest/src/kotlinx/serialization/features/EmojiTest.kt b/formats/json-tests/commonTest/src/kotlinx/serialization/features/EmojiTest.kt new file mode 100644 index 000000000..5765300d5 --- /dev/null +++ b/formats/json-tests/commonTest/src/kotlinx/serialization/features/EmojiTest.kt @@ -0,0 +1,22 @@ +/* + * Copyright 2017-2022 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license. 
+ */ + +package kotlinx.serialization.features + +import kotlinx.serialization.builtins.serializer +import kotlinx.serialization.json.JsonTestBase +import kotlin.test.Test + + +class EmojiTest : JsonTestBase() { + + @Test + fun testEmojiString() { + assertJsonFormAndRestored( + String.serializer(), + "\uD83C\uDF34", + "\"🌴\"" + ) + } +} From 5a78fb6a164a0e38a779cb188f247787f7e45938 Mon Sep 17 00:00:00 2001 From: "Sergey.Shanshin" Date: Thu, 8 Sep 2022 14:12:24 +0200 Subject: [PATCH 2/3] ~fix emoji --- .../commonTest/src/kotlinx/serialization/features/EmojiTest.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/formats/json-tests/commonTest/src/kotlinx/serialization/features/EmojiTest.kt b/formats/json-tests/commonTest/src/kotlinx/serialization/features/EmojiTest.kt index 5765300d5..1e3904ab2 100644 --- a/formats/json-tests/commonTest/src/kotlinx/serialization/features/EmojiTest.kt +++ b/formats/json-tests/commonTest/src/kotlinx/serialization/features/EmojiTest.kt @@ -16,7 +16,7 @@ class EmojiTest : JsonTestBase() { assertJsonFormAndRestored( String.serializer(), "\uD83C\uDF34", - "\"🌴\"" + "\"\uD83C\uDF34\"" ) } } From 13102ac7c0f8689552877794477625e4bcd579ba Mon Sep 17 00:00:00 2001 From: "Sergey.Shanshin" Date: Fri, 9 Sep 2022 12:20:27 +0200 Subject: [PATCH 3/3] ~fix emoji test for Windows --- .../jvmTest/src/kotlinx/serialization/test/JsonHelpers.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/formats/json-tests/jvmTest/src/kotlinx/serialization/test/JsonHelpers.kt b/formats/json-tests/jvmTest/src/kotlinx/serialization/test/JsonHelpers.kt index ebb49c356..9220bbd32 100644 --- a/formats/json-tests/jvmTest/src/kotlinx/serialization/test/JsonHelpers.kt +++ b/formats/json-tests/jvmTest/src/kotlinx/serialization/test/JsonHelpers.kt @@ -11,7 +11,7 @@ actual fun Json.encodeViaStream( ): String { val output = ByteArrayOutputStream() encodeToStream(serializer, value, output) - return output.toString() + return 
output.toString(Charsets.UTF_8.name()) } actual fun Json.decodeViaStream(