serde-rs · lucacasonato · Apr 12, 2022 · lucacasonato · Apr 12, 2022 · lucacasonato
diff --git a/src/de.rs b/src/de.rs
@@ -1570,7 +1570,10 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
     ///
     /// The behavior of serde_json is specified to fail on non-UTF-8 strings
     /// when deserializing into Rust UTF-8 string types such as String, and
-    /// succeed with non-UTF-8 bytes when deserializing using this method.
+    /// succeed with the bytes representing the [WTF-8] encoding of the code
+    /// points, lone surrogates included, when deserializing using this method.
+    ///
+    /// [WTF-8]: https://simonsapin.github.io/wtf-8
     ///
     /// Escape sequences are processed as usual, and for `\uXXXX` escapes it is
     /// still checked if the hex number represents a valid Unicode code point.

diff --git a/src/read.rs b/src/read.rs
@@ -861,20 +861,33 @@ fn parse_escape<'de, R: Read<'de>>(
         b'r' => scratch.push(b'\r'),
         b't' => scratch.push(b'\t'),
         b'u' => {
-            fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
-                scratch.extend_from_slice(&[
-                    (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
-                    (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
-                    (n & 0b0011_1111) as u8 | 0b1000_0000,
-                ]);
+            fn encode_wtf8(scratch: &mut Vec<u8>, cp: u16) { // appends `cp` to `scratch` as 1-3 bytes; surrogate values are not rejected
+                match cp {
+                    0x0000..=0x007F => { // 1 byte: the value itself
+                        scratch.extend_from_slice(&[cp as u8]);
+                    }
+                    0x0080..=0x07FF => { // 2 bytes: 110xxxxx 10xxxxxx
+                        scratch
+                            .extend_from_slice(&[0xC0 | (cp >> 6) as u8, 0x80 | (cp & 0x3F) as u8]);
+                    }
+                    0x0800..=0xFFFF => { // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx (this range contains 0xD800..=0xDFFF)
+                        scratch.extend_from_slice(&[
+                            0xE0 | (cp >> 12) as u8, // top 4 bits
+                            0x80 | ((cp >> 6) & 0x3F) as u8, // middle 6 bits
+                            0x80 | (cp & 0x3F) as u8, // low 6 bits
+                        ]);
+                    }
+                }
             }
 
             let c = match tri!(read.decode_hex_escape()) {
                 n @ 0xDC00..=0xDFFF => {
                     return if validate {
+                        // TODO: the error code is misleading, this is a lone
+                        // _trailing_ surrogate, not a leading one
                         error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
                     } else {
-                        encode_surrogate(scratch, n);
+                        encode_wtf8(scratch, n);
                         Ok(())
                     };
                 }
@@ -889,9 +902,9 @@ fn parse_escape<'de, R: Read<'de>>(
                     } else {
                         return if validate {
                             read.discard();
-                            error(read, ErrorCode::UnexpectedEndOfHexEscape)
+                            error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
                         } else {
-                            encode_surrogate(scratch, n1);
+                            encode_wtf8(scratch, n1);
                             Ok(())
                         };
                     }
@@ -903,7 +916,7 @@ fn parse_escape<'de, R: Read<'de>>(
                             read.discard();
                             error(read, ErrorCode::UnexpectedEndOfHexEscape)
                         } else {
-                            encode_surrogate(scratch, n1);
+                            encode_wtf8(scratch, n1);
                             // The \ prior to this byte started an escape sequence,
                             // so we need to parse that now. This recursive call
                             // does not blow the stack on malicious input because
@@ -916,7 +929,13 @@ fn parse_escape<'de, R: Read<'de>>(
                     let n2 = tri!(read.decode_hex_escape());
 
                     if n2 < 0xDC00 || n2 > 0xDFFF {
-                        return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
+                        return if validate {
+                            error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
+                        } else {
+                            encode_wtf8(scratch, n1);
+                            encode_wtf8(scratch, n2);
+                            Ok(())
+                        };
                     }
 
                     let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;

diff --git a/tests/test.rs b/tests/test.rs
@@ -1713,7 +1713,8 @@ fn test_byte_buf_de() {
 }
 
 #[test]
-fn test_byte_buf_de_lone_surrogate() {
+fn test_byte_buf_de_invalid_surrogates() {
+    // lone leading surrogate
     let bytes = ByteBuf::from(vec![237, 160, 188]);
     let v: ByteBuf = from_str(r#""\ud83c""#).unwrap();
     assert_eq!(v, bytes);
@@ -1726,23 +1727,49 @@ fn test_byte_buf_de_lone_surrogate() {
     let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap();
     assert_eq!(v, bytes);
 
-    let bytes = ByteBuf::from(vec![237, 176, 129]);
-    let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
-    assert_eq!(v, bytes);
-
     let res = from_str::<ByteBuf>(r#""\ud83c\!""#);
     assert!(res.is_err());
 
     let res = from_str::<ByteBuf>(r#""\ud83c\u""#);
     assert!(res.is_err());
 
-    let res = from_str::<ByteBuf>(r#""\ud83c\ud83c""#);
-    assert!(res.is_err());
+    // lone trailing surrogate
+    let bytes = ByteBuf::from(vec![237, 176, 129]);
+    let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by other leading surrogate
+    let bytes = ByteBuf::from(vec![237, 160, 188, 237, 160, 188]);
+    let v: ByteBuf = from_str(r#""\ud83c\ud83c""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by "a" (U+0061) in \u encoding
+    let bytes = ByteBuf::from(vec![237, 160, 188, 97]);
+    let v: ByteBuf = from_str(r#""\ud83c\u0061""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by U+0080
+    let bytes = ByteBuf::from(vec![237, 160, 188, 194, 128]);
+    let v: ByteBuf = from_str(r#""\ud83c\u0080""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by U+FFFF
+    let bytes = ByteBuf::from(vec![237, 160, 188, 239, 191, 191]);
+    let v: ByteBuf = from_str(r#""\ud83c\uffff""#).unwrap();
+    assert_eq!(v, bytes);
+}
+
+#[test]
+fn test_byte_buf_de_surrogate_pair() {
+    // leading surrogate followed by trailing surrogate (a valid pair: U+1F000)
+    let bytes = ByteBuf::from(vec![240, 159, 128, 128]); // F0 9F 80 80, the 4-byte UTF-8 encoding of U+1F000
+    let v: ByteBuf = from_str(r#""\ud83c\udc00""#).unwrap();
+    assert_eq!(v, bytes);
 }
 
 #[cfg(feature = "raw_value")]
 #[test]
-fn test_raw_de_lone_surrogate() {
+fn test_raw_de_invalid_surrogates() {
     use serde_json::value::RawValue;
 
     assert!(from_str::<Box<RawValue>>(r#""\ud83c""#).is_ok());
@@ -1752,6 +1779,17 @@ fn test_raw_de_lone_surrogate() {
     assert!(from_str::<Box<RawValue>>(r#""\udc01\!""#).is_err());
     assert!(from_str::<Box<RawValue>>(r#""\udc01\u""#).is_err());
     assert!(from_str::<Box<RawValue>>(r#""\ud83c\ud83c""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0061""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0080""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\uffff""#).is_ok());
+}
+
+#[cfg(feature = "raw_value")]
+#[test]
+fn test_raw_de_surrogate_pair() {
+    use serde_json::value::RawValue;
+
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\udc00""#).is_ok()); // leading + trailing surrogate escapes must parse as a RawValue
 }
 
 #[test]