diff --git a/src/de.rs b/src/de.rs
index ffd0d48c2..2b6daf46f 100644
--- a/src/de.rs
+++ b/src/de.rs
@@ -1570,7 +1570,10 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
     ///
     /// The behavior of serde_json is specified to fail on non-UTF-8 strings
     /// when deserializing into Rust UTF-8 string types such as String, and
-    /// succeed with non-UTF-8 bytes when deserializing using this method.
+    /// succeed with the bytes representing the [WTF-8] encoding of code points
+    /// when deserializing using this method.
+    ///
+    /// [WTF-8]: https://simonsapin.github.io/wtf-8
     ///
     /// Escape sequences are processed as usual, and for `\uXXXX` escapes it is
     /// still checked if the hex number represents a valid Unicode code point.
diff --git a/src/read.rs b/src/read.rs
index 1319d89c9..f3774e563 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -861,20 +861,33 @@ fn parse_escape<'de, R: Read<'de>>(
         b'r' => scratch.push(b'\r'),
         b't' => scratch.push(b'\t'),
         b'u' => {
-            fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
-                scratch.extend_from_slice(&[
-                    (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
-                    (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
-                    (n & 0b0011_1111) as u8 | 0b1000_0000,
-                ]);
+            fn encode_wtf8(scratch: &mut Vec<u8>, cp: u16) {
+                match cp {
+                    0x0000..=0x007F => {
+                        scratch.extend_from_slice(&[cp as u8]);
+                    }
+                    0x0080..=0x07FF => {
+                        scratch
+                            .extend_from_slice(&[0xC0 | (cp >> 6) as u8, 0x80 | (cp & 0x3F) as u8]);
+                    }
+                    0x0800..=0xFFFF => {
+                        scratch.extend_from_slice(&[
+                            0xE0 | (cp >> 12) as u8,
+                            0x80 | ((cp >> 6) & 0x3F) as u8,
+                            0x80 | (cp & 0x3F) as u8,
+                        ]);
+                    }
+                }
             }
 
             let c = match tri!(read.decode_hex_escape()) {
                 n @ 0xDC00..=0xDFFF => {
                     return if validate {
+                        // TODO: the error message is wrong, this is a lone
+                        // _trailing_ surrogate
                         error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
                     } else {
-                        encode_surrogate(scratch, n);
+                        encode_wtf8(scratch, n);
                         Ok(())
                     };
                 }
@@ -889,9 +902,9 @@ fn parse_escape<'de, R: Read<'de>>(
                     } else {
                         return if validate {
                             read.discard();
-                            error(read, ErrorCode::UnexpectedEndOfHexEscape)
+                            error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
                         } else {
-                            encode_surrogate(scratch, n1);
+                            encode_wtf8(scratch, n1);
                             Ok(())
                         };
                     }
@@ -903,7 +916,7 @@ fn parse_escape<'de, R: Read<'de>>(
                             read.discard();
                             error(read, ErrorCode::UnexpectedEndOfHexEscape)
                         } else {
-                            encode_surrogate(scratch, n1);
+                            encode_wtf8(scratch, n1);
                             // The \ prior to this byte started an escape sequence,
                             // so we need to parse that now. This recursive call
                             // does not blow the stack on malicious input because
@@ -916,7 +929,13 @@ fn parse_escape<'de, R: Read<'de>>(
                     let n2 = tri!(read.decode_hex_escape());
 
                     if n2 < 0xDC00 || n2 > 0xDFFF {
-                        return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
+                        return if validate {
+                            error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
+                        } else {
+                            encode_wtf8(scratch, n1);
+                            encode_wtf8(scratch, n2);
+                            Ok(())
+                        };
                     }
 
                     let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
diff --git a/tests/test.rs b/tests/test.rs
index b11635e75..7c5d5a78e 100644
--- a/tests/test.rs
+++ b/tests/test.rs
@@ -1713,7 +1713,8 @@ fn test_byte_buf_de() {
 }
 
 #[test]
-fn test_byte_buf_de_lone_surrogate() {
+fn test_byte_buf_de_invalid_surrogates() {
+    // lone leading surrogate
     let bytes = ByteBuf::from(vec![237, 160, 188]);
     let v: ByteBuf = from_str(r#""\ud83c""#).unwrap();
     assert_eq!(v, bytes);
@@ -1726,23 +1727,49 @@ fn test_byte_buf_de_lone_surrogate() {
     let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap();
     assert_eq!(v, bytes);
 
-    let bytes = ByteBuf::from(vec![237, 176, 129]);
-    let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
-    assert_eq!(v, bytes);
-
     let res = from_str::<ByteBuf>(r#""\ud83c\!""#);
     assert!(res.is_err());
 
     let res = from_str::<ByteBuf>(r#""\ud83c\u""#);
     assert!(res.is_err());
 
-    let res = from_str::<ByteBuf>(r#""\ud83c\ud83c""#);
-    assert!(res.is_err());
+    // lone trailing surrogate
+    let bytes = ByteBuf::from(vec![237, 176, 129]);
+    let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by other leading surrogate
+    let bytes = ByteBuf::from(vec![237, 160, 188, 237, 160, 188]);
+    let v: ByteBuf = from_str(r#""\ud83c\ud83c""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by "a" (U+0061) in \u encoding
+    let bytes = ByteBuf::from(vec![237, 160, 188, 97]);
+    let v: ByteBuf = from_str(r#""\ud83c\u0061""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by U+0080
+    let bytes = ByteBuf::from(vec![237, 160, 188, 194, 128]);
+    let v: ByteBuf = from_str(r#""\ud83c\u0080""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by U+FFFF
+    let bytes = ByteBuf::from(vec![237, 160, 188, 239, 191, 191]);
+    let v: ByteBuf = from_str(r#""\ud83c\uffff""#).unwrap();
+    assert_eq!(v, bytes);
+}
+
+#[test]
+fn test_byte_buf_de_surrogate_pair() {
+    // leading surrogate followed by trailing surrogate
+    let bytes = ByteBuf::from(vec![240, 159, 128, 128]);
+    let v: ByteBuf = from_str(r#""\ud83c\udc00""#).unwrap();
+    assert_eq!(v, bytes);
 }
 
 #[cfg(feature = "raw_value")]
 #[test]
-fn test_raw_de_lone_surrogate() {
+fn test_raw_de_invalid_surrogates() {
     use serde_json::value::RawValue;
 
     assert!(from_str::<Box<RawValue>>(r#""\ud83c""#).is_ok());
@@ -1752,6 +1779,17 @@ fn test_raw_de_lone_surrogate() {
     assert!(from_str::<Box<RawValue>>(r#""\udc01\!""#).is_err());
     assert!(from_str::<Box<RawValue>>(r#""\udc01\u""#).is_err());
     assert!(from_str::<Box<RawValue>>(r#""\ud83c\ud83c""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0061""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0080""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\uffff""#).is_ok());
+}
+
+#[cfg(feature = "raw_value")]
+#[test]
+fn test_raw_de_surrogate_pair() {
+    use serde_json::value::RawValue;
+
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\udc00""#).is_ok());
 }
 
 #[test]