From 849c6845b4c5b5076897bedeb9d0912e95e50517 Mon Sep 17 00:00:00 2001 From: Luca Casonato Date: Wed, 24 Nov 2021 14:50:33 +0100 Subject: [PATCH 1/3] Deserialize lone surrogates into byte bufs This commit deserializes lone surrogates in strings that are encoded in escape sequences instead of erroring on them. --- src/de.rs | 13 ++++++------ src/read.rs | 59 +++++++++++++++++++++++++++++++++++++++++++++------ tests/test.rs | 7 ++++++ 3 files changed, 66 insertions(+), 13 deletions(-) diff --git a/src/de.rs b/src/de.rs index a2f34b908..0766edf31 100644 --- a/src/de.rs +++ b/src/de.rs @@ -1580,20 +1580,21 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer { /// ``` /// /// Backslash escape sequences like `\n` are still interpreted and required - /// to be valid, and `\u` escape sequences are required to represent valid - /// Unicode code points. + /// to be valid. `\u` escape sequences are required to represent valid + /// Unicode code points, except in the case of lone surrogate pairs. 
/// /// ``` /// use serde_bytes::ByteBuf; /// /// fn look_at_bytes() { - /// let json_data = b"\"invalid unicode surrogate: \\uD801\""; + /// let json_data = b"\"lone surrogate pair: \\uD801\""; /// let parsed: Result<ByteBuf, _> = serde_json::from_slice(json_data); /// - /// assert!(parsed.is_err()); + /// assert!(parsed.is_ok()); /// - /// let expected_msg = "unexpected end of hex escape at line 1 column 35"; - /// assert_eq!(expected_msg, parsed.unwrap_err().to_string()); + /// let expected = b"lone surrogate pair: \xED\xA0\x81"; + /// let bytes: ByteBuf = parsed.unwrap(); + /// assert_eq!(expected, &bytes[..]); /// } /// # /// # look_at_bytes(); diff --git a/src/read.rs b/src/read.rs index 4e883c68b..329a6df3c 100644 --- a/src/read.rs +++ b/src/read.rs @@ -225,7 +225,7 @@ where return result(self, scratch); } b'\\' => { - tri!(parse_escape(self, scratch)); + tri!(parse_escape(self, validate, scratch)); } _ => { if validate { @@ -465,7 +465,7 @@ impl<'a> SliceRead<'a> { b'\\' => { scratch.extend_from_slice(&self.slice[start..self.index]); self.index += 1; - tri!(parse_escape(self, scratch)); + tri!(parse_escape(self, validate, scratch)); start = self.index; } _ => { @@ -817,6 +817,16 @@ where } } +fn peek_or_eof<'de, R>(read: &mut R) -> Result<u8> +where + R: ?Sized + Read<'de>, +{ + match tri!(read.peek()) { + Some(b) => Ok(b), + None => error(read, ErrorCode::EofWhileParsingString), + } +} + fn error<'de, R, T>(read: &R, reason: ErrorCode) -> Result<T> where R: ?Sized + Read<'de>, @@ -831,7 +841,11 @@ fn as_str<'de, 's, R: Read<'de>>(read: &R, slice: &'s [u8]) -> Result<&'s str> { /// Parses a JSON escape sequence and appends it into the scratch space. Assumes /// the previous byte read was a backslash. 
-fn parse_escape<'de, R: Read<'de>>(read: &mut R, scratch: &mut Vec<u8>) -> Result<()> { +fn parse_escape<'de, R: Read<'de>>( + read: &mut R, + validate: bool, + scratch: &mut Vec<u8>, +) -> Result<()> { let ch = tri!(next_or_eof(read)); match ch { @@ -851,13 +865,44 @@ fn parse_escape<'de, R: Read<'de>>(read: &mut R, scratch: &mut Vec<u8>) -> Resul // Non-BMP characters are encoded as a sequence of // two hex escapes, representing UTF-16 surrogates. + // If `validate` is false and we only find a single + // hex escape that is a surrogate, then we'll accept + // it instead of erroring. n1 @ 0xD800..=0xDBFF => { - if tri!(next_or_eof(read)) != b'\\' { - return error(read, ErrorCode::UnexpectedEndOfHexEscape); + if tri!(peek_or_eof(read)) != b'\\' { + if validate { + tri!(next_or_eof(read)); + return error(read, ErrorCode::UnexpectedEndOfHexEscape); + } + + let utf8_bytes = [ + (n1 >> 12 & 0x0F) as u8 | 0b1110_0000, + (n1 >> 6 & 0x3F) as u8 | 0b1000_0000, + (n1 & 0x3F) as u8 | 0b1000_0000, + ]; + + scratch.extend_from_slice(&utf8_bytes); + + return Ok(()); } - if tri!(next_or_eof(read)) != b'u' { - return error(read, ErrorCode::UnexpectedEndOfHexEscape); + tri!(next_or_eof(read)); + if tri!(peek_or_eof(read)) != b'u' { + if validate { + tri!(next_or_eof(read)); + return error(read, ErrorCode::UnexpectedEndOfHexEscape); + } + + let utf8_bytes = [ + (n1 >> 12 & 0x0F) as u8 | 0b1110_0000, + (n1 >> 6 & 0x3F) as u8 | 0b1000_0000, + (n1 & 0x3F) as u8 | 0b1000_0000, + ]; + + scratch.extend_from_slice(&utf8_bytes); + + return Ok(()); } + tri!(next_or_eof(read)); let n2 = tri!(read.decode_hex_escape()); diff --git a/tests/test.rs b/tests/test.rs index 4b7540540..bf19532b0 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1714,6 +1714,13 @@ fn test_byte_buf_de() { assert_eq!(v, bytes); } +#[test] +fn test_byte_buf_de_invalid_escape_sequence() { + let bytes = ByteBuf::from(vec![237, 160, 188]); + let v: ByteBuf = from_str(r#""\ud83c""#).unwrap(); + assert_eq!(v, bytes); +} + #[test] fn 
test_byte_buf_de_multiple() { let s: Vec<ByteBuf> = from_str(r#"["ab\nc", "cd\ne"]"#).unwrap(); From 4c28c5737b26f15bc7457d53eff8517b33cb4a43 Mon Sep 17 00:00:00 2001 From: Luca Casonato Date: Wed, 24 Nov 2021 23:29:17 +0100 Subject: [PATCH 2/3] fix wording --- src/de.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/de.rs b/src/de.rs index 0766edf31..d9a5fee8c 100644 --- a/src/de.rs +++ b/src/de.rs @@ -1581,18 +1581,18 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> { /// /// Backslash escape sequences like `\n` are still interpreted and required /// to be valid. `\u` escape sequences are required to represent valid - /// Unicode code points, except in the case of lone surrogate pairs. + /// Unicode code points, except in the case of lone surrogates. /// /// ``` /// use serde_bytes::ByteBuf; /// /// fn look_at_bytes() { - /// let json_data = b"\"lone surrogate pair: \\uD801\""; + /// let json_data = b"\"lone surrogate: \\uD801\""; /// let parsed: Result<ByteBuf, _> = serde_json::from_slice(json_data); /// /// assert!(parsed.is_ok()); /// - /// let expected = b"lone surrogate pair: \xED\xA0\x81"; + /// let expected = b"lone surrogate: \xED\xA0\x81"; /// let bytes: ByteBuf = parsed.unwrap(); /// assert_eq!(expected, &bytes[..]); /// } From 07c740c2ffa2e81c9c3dc7982b67572e990e2cdd Mon Sep 17 00:00:00 2001 From: Luca Casonato Date: Wed, 24 Nov 2021 23:48:42 +0100 Subject: [PATCH 3/3] fix parsing escape sequences after lone surrogates --- src/read.rs | 29 +++++++++++++++++++++++------ tests/test.rs | 23 ++++++++++++++++++++++- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/src/read.rs b/src/read.rs index 329a6df3c..034cc6557 100644 --- a/src/read.rs +++ b/src/read.rs @@ -859,8 +859,20 @@ fn parse_escape<'de, R: Read<'de>>( b't' => scratch.push(b'\t'), b'u' => { let c = match tri!(read.decode_hex_escape()) { - 0xDC00..=0xDFFF => { - return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); + n @ 0xDC00..=0xDFFF 
=> { + if validate { + return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); + } + + let utf8_bytes = [ + (n >> 12 & 0x0F) as u8 | 0b1110_0000, + (n >> 6 & 0x3F) as u8 | 0b1000_0000, + (n & 0x3F) as u8 | 0b1000_0000, + ]; + + scratch.extend_from_slice(&utf8_bytes); + + return Ok(()); } // Non-BMP characters are encoded as a sequence of @@ -871,7 +883,7 @@ fn parse_escape<'de, R: Read<'de>>( n1 @ 0xD800..=0xDBFF => { if tri!(peek_or_eof(read)) != b'\\' { if validate { - tri!(next_or_eof(read)); + read.discard(); return error(read, ErrorCode::UnexpectedEndOfHexEscape); } @@ -885,10 +897,11 @@ fn parse_escape<'de, R: Read<'de>>( return Ok(()); } - tri!(next_or_eof(read)); + read.discard(); + if tri!(peek_or_eof(read)) != b'u' { if validate { - tri!(next_or_eof(read)); + read.discard(); return error(read, ErrorCode::UnexpectedEndOfHexEscape); } @@ -900,9 +913,13 @@ fn parse_escape<'de, R: Read<'de>>( scratch.extend_from_slice(&utf8_bytes); + // The \ prior to this byte started an escape sequence, + // so we need to parse that now. 
+ parse_escape(read, validate, scratch)?; + return Ok(()); } - tri!(next_or_eof(read)); + read.discard(); let n2 = tri!(read.decode_hex_escape()); diff --git a/tests/test.rs b/tests/test.rs index bf19532b0..636053004 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1715,10 +1715,31 @@ fn test_byte_buf_de() { } #[test] -fn test_byte_buf_de_invalid_escape_sequence() { +fn test_byte_buf_de_lone_surrogate() { let bytes = ByteBuf::from(vec![237, 160, 188]); let v: ByteBuf = from_str(r#""\ud83c""#).unwrap(); assert_eq!(v, bytes); + + let bytes = ByteBuf::from(vec![237, 160, 188, 10]); + let v: ByteBuf = from_str(r#""\ud83c\n""#).unwrap(); + assert_eq!(v, bytes); + + let bytes = ByteBuf::from(vec![237, 160, 188, 32]); + let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap(); + assert_eq!(v, bytes); + + let bytes = ByteBuf::from(vec![237, 176, 129]); + let v: ByteBuf = from_str(r#""\udc01""#).unwrap(); + assert_eq!(v, bytes); + + let res = from_str::<ByteBuf>(r#""\ud83c\!""#); + assert!(res.is_err()); + + let res = from_str::<ByteBuf>(r#""\ud83c\u""#); + assert!(res.is_err()); + + let res = from_str::<ByteBuf>(r#""\ud83c\ud83c""#); + assert!(res.is_err()); } #[test]