Skip to content

Commit

Permalink
Support for parsing partial JSON strings in Python (#66)
Browse files Browse the repository at this point in the history
  • Loading branch information
samuelcolvin committed Mar 25, 2024
1 parent 4dd0c35 commit 56db4fc
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 38 deletions.
5 changes: 3 additions & 2 deletions benches/python.rs
Expand Up @@ -15,6 +15,7 @@ fn python_parse_numeric(bench: &mut Bencher) {
br#" { "int": 1, "bigint": 123456789012345678901234567890, "float": 1.2} "#,
false,
true,
false,
)
.unwrap()
});
Expand All @@ -23,7 +24,7 @@ fn python_parse_numeric(bench: &mut Bencher) {

fn python_parse_other(bench: &mut Bencher) {
Python::with_gil(|py| {
bench.iter(|| python_parse(py, br#"["string", true, false, null]"#, false, true).unwrap());
bench.iter(|| python_parse(py, br#"["string", true, false, null]"#, false, true, false).unwrap());
})
}

Expand All @@ -34,7 +35,7 @@ fn _python_parse_file(path: &str, bench: &mut Bencher, cache_strings: bool) {
let json_data = contents.as_bytes();

Python::with_gil(|py| {
bench.iter(|| python_parse(py, json_data, false, cache_strings).unwrap());
bench.iter(|| python_parse(py, json_data, false, cache_strings, false).unwrap());
})
}

Expand Down
118 changes: 90 additions & 28 deletions src/python.rs
Expand Up @@ -13,6 +13,7 @@ use crate::errors::{json_err, json_error, JsonError, JsonResult, DEFAULT_RECURSI
use crate::number_decoder::{NumberAny, NumberInt};
use crate::parse::{Parser, Peek};
use crate::string_decoder::{StringDecoder, Tape};
use crate::JsonErrorType;

/// Parse a JSON value from a byte slice and return a Python object.
///
Expand All @@ -31,12 +32,14 @@ pub fn python_parse<'py>(
json_data: &[u8],
allow_inf_nan: bool,
cache_strings: bool,
allow_partial: bool,
) -> JsonResult<Bound<'py, PyAny>> {
let mut python_parser = PythonParser {
parser: Parser::new(json_data),
tape: Tape::default(),
recursion_limit: DEFAULT_RECURSION_LIMIT,
allow_inf_nan,
allow_partial,
};

let peek = python_parser.parser.peek()?;
Expand All @@ -45,7 +48,9 @@ pub fn python_parse<'py>(
} else {
python_parser.py_take_value::<StringNoCache>(py, peek)?
};
python_parser.parser.finish()?;
if !allow_partial {
python_parser.parser.finish()?;
}
Ok(v)
}

Expand All @@ -59,6 +64,7 @@ struct PythonParser<'j> {
tape: Tape,
recursion_limit: u8,
allow_inf_nan: bool,
allow_partial: bool,
}

impl<'j> PythonParser<'j> {
Expand All @@ -67,6 +73,21 @@ impl<'j> PythonParser<'j> {
py: Python<'py>,
peek: Peek,
) -> JsonResult<Bound<'py, PyAny>> {
macro_rules! tri {
($result:expr, $partial_value:expr) => {
match $result {
Ok(k) => k,
Err(e) => {
return if self._allow_partial_err(&e) {
Ok($partial_value.into_any())
} else {
Err(e)
}
}
}
};
}

match peek {
Peek::Null => {
self.parser.consume_null()?;
Expand All @@ -85,13 +106,12 @@ impl<'j> PythonParser<'j> {
Ok(StringCache::get(py, s.as_str()))
}
Peek::Array => {
let list = if let Some(peek_first) = self.parser.array_first()? {
let list = if let Some(peek_first) = tri!(self.parser.array_first(), PyList::empty_bound(py)) {
let mut vec: SmallVec<[Bound<'_, PyAny>; 8]> = SmallVec::with_capacity(8);
let v = self._check_take_value::<StringCache>(py, peek_first)?;
vec.push(v);
while let Some(peek) = self.parser.array_step()? {
let v = self._check_take_value::<StringCache>(py, peek)?;
vec.push(v);
if let Err(e) = self._parse_array::<StringCache>(py, peek_first, &mut vec) {
if !self._allow_partial_err(&e) {
return Err(e);
}
}
PyList::new_bound(py, vec)
} else {
Expand All @@ -101,27 +121,9 @@ impl<'j> PythonParser<'j> {
}
Peek::Object => {
let dict = PyDict::new_bound(py);

let set_item = |key: Bound<'py, PyAny>, value: Bound<'py, PyAny>| {
let r = unsafe { ffi::PyDict_SetItem(dict.as_ptr(), key.as_ptr(), value.as_ptr()) };
// AFAIK this shouldn't happen since the key will always be a string which is hashable
// we panic here rather than returning a result and using `?` below as it's up to 14% faster
// presumably because there are fewer branches
if r == -1 {
panic!("PyDict_SetItem failed")
}
};

if let Some(first_key) = self.parser.object_first::<StringDecoder>(&mut self.tape)? {
let first_key = StringCache::get(py, first_key.as_str());
let peek = self.parser.peek()?;
let first_value = self._check_take_value::<StringCache>(py, peek)?;
set_item(first_key, first_value);
while let Some(key) = self.parser.object_step::<StringDecoder>(&mut self.tape)? {
let key = StringCache::get(py, key.as_str());
let peek = self.parser.peek()?;
let value = self._check_take_value::<StringCache>(py, peek)?;
set_item(key, value);
if let Err(e) = self._parse_object::<StringCache>(py, &dict) {
if !self._allow_partial_err(&e) {
return Err(e);
}
}
Ok(dict.into_any())
Expand All @@ -146,6 +148,66 @@ impl<'j> PythonParser<'j> {
}
}

/// Decode the elements of a JSON array into `vec`.
///
/// `peek_first` is the peek for the first element, obtained by the caller.
/// Each element is pushed as soon as it has been decoded, so when an error
/// is returned `vec` still holds every element that was completed before
/// the failure.
fn _parse_array<'py, StringCache: StringMaybeCache>(
    &mut self,
    py: Python<'py>,
    peek_first: Peek,
    vec: &mut SmallVec<[Bound<'py, PyAny>; 8]>,
) -> JsonResult<()> {
    // Seed the loop with the first element, then keep stepping until the
    // parser reports the end of the array (`None`).
    let mut next_peek = Some(peek_first);
    while let Some(peek) = next_peek {
        vec.push(self._check_take_value::<StringCache>(py, peek)?);
        next_peek = self.parser.array_step()?;
    }
    Ok(())
}

/// Decode the members of a JSON object into `dict`.
///
/// Each key/value pair is inserted as soon as its value has been decoded,
/// so when an error is returned `dict` still holds every pair that was
/// completed before the failure.
///
/// # Panics
///
/// Panics if `PyDict_SetItem` reports failure.
fn _parse_object<'py, StringCache: StringMaybeCache>(
    &mut self,
    py: Python<'py>,
    dict: &Bound<'py, PyDict>,
) -> JsonResult<()> {
    let insert = |k: Bound<'py, PyAny>, v: Bound<'py, PyAny>| {
        // NOTE(review): the pointers come from live `Bound<'py, _>` handles,
        // which presumably guarantees validity and that the GIL is held.
        let status = unsafe { ffi::PyDict_SetItem(dict.as_ptr(), k.as_ptr(), v.as_ptr()) };
        // Failure is not expected because keys are always strings, which are
        // hashable. Panicking instead of returning a `Result` (and using `?`
        // at the call site) was measured by the original author as up to 14%
        // faster, presumably because there are fewer branches.
        assert!(status != -1, "PyDict_SetItem failed");
    };

    // Seed the loop with the first key, then keep stepping until the parser
    // reports the end of the object (`None`).
    let mut pending = self.parser.object_first::<StringDecoder>(&mut self.tape)?;
    while let Some(raw_key) = pending {
        let key = StringCache::get(py, raw_key.as_str());
        let peek = self.parser.peek()?;
        let value = self._check_take_value::<StringCache>(py, peek)?;
        insert(key, value);
        pending = self.parser.object_step::<StringDecoder>(&mut self.tape)?;
    }
    Ok(())
}

/// Report whether `e` may be ignored because partial parsing is enabled.
///
/// Returns `true` only when `allow_partial` is set and the error is one of
/// the listed EOF or "expected comma or end" variants; otherwise `false`.
fn _allow_partial_err(&self, e: &JsonError) -> bool {
    self.allow_partial
        && matches!(
            e.error_type,
            JsonErrorType::EofWhileParsingList
                | JsonErrorType::EofWhileParsingObject
                | JsonErrorType::EofWhileParsingString
                | JsonErrorType::EofWhileParsingValue
                | JsonErrorType::ExpectedListCommaOrEnd
                | JsonErrorType::ExpectedObjectCommaOrEnd
        )
}

fn _check_take_value<'py, StringCache: StringMaybeCache>(
&mut self,
py: Python<'py>,
Expand Down
67 changes: 59 additions & 8 deletions tests/python.rs
@@ -1,4 +1,5 @@
use pyo3::prelude::*;
use pyo3::types::{PyDict, PyList};
use pyo3::ToPyObject;

use jiter::{map_json_error, python_parse, JsonValue};
Expand Down Expand Up @@ -42,6 +43,7 @@ fn test_python_parse_numeric() {
br#" { "int": 1, "bigint": 123456789012345678901234567890, "float": 1.2} "#,
false,
true,
false,
)
.unwrap();
assert_eq!(
Expand All @@ -59,6 +61,7 @@ fn test_python_parse_other_cached() {
br#"["string", true, false, null, NaN, Infinity, -Infinity]"#,
true,
true,
false,
)
.unwrap();
assert_eq!(obj.to_string(), "['string', True, False, None, nan, inf, -inf]");
Expand All @@ -68,15 +71,15 @@ fn test_python_parse_other_cached() {
#[test]
fn test_python_parse_other_no_cache() {
Python::with_gil(|py| {
let obj = python_parse(py, br#"["string", true, false, null]"#, false, false).unwrap();
let obj = python_parse(py, br#"["string", true, false, null]"#, false, false, false).unwrap();
assert_eq!(obj.to_string(), "['string', True, False, None]");
})
}

#[test]
fn test_python_disallow_nan() {
Python::with_gil(|py| {
let r = python_parse(py, br#"[NaN]"#, false, true);
let r = python_parse(py, br#"[NaN]"#, false, true, false);
let e = r.map_err(|e| map_json_error(br#"[NaN]"#, &e)).unwrap_err();
assert_eq!(e.to_string(), "ValueError: expected value at line 1 column 2");
})
Expand All @@ -86,7 +89,7 @@ fn test_python_disallow_nan() {
fn test_error() {
Python::with_gil(|py| {
let bytes = br#"["string""#;
let r = python_parse(py, bytes, false, true);
let r = python_parse(py, bytes, false, true, false);
let e = r.map_err(|e| map_json_error(bytes, &e)).unwrap_err();
assert_eq!(e.to_string(), "ValueError: EOF while parsing a list at line 1 column 9");
})
Expand All @@ -98,7 +101,7 @@ fn test_recursion_limit() {
let bytes = json.as_bytes();

Python::with_gil(|py| {
let r = python_parse(py, bytes, false, true);
let r = python_parse(py, bytes, false, true, false);
let e = r.map_err(|e| map_json_error(bytes, &e)).unwrap_err();
assert_eq!(
e.to_string(),
Expand All @@ -114,24 +117,72 @@ fn test_recursion_limit_incr() {
let bytes = json.as_bytes();

Python::with_gil(|py| {
let v = python_parse(py, bytes, false, true).unwrap();
let v = python_parse(py, bytes, false, true, false).unwrap();
assert_eq!(v.len().unwrap(), 2000);
});

Python::with_gil(|py| {
let v = python_parse(py, bytes, false, true).unwrap();
let v = python_parse(py, bytes, false, true, false).unwrap();
assert_eq!(v.len().unwrap(), 2000);
});
}

#[test]
fn test_exected_value_error() {
fn test_extracted_value_error() {
let json = "xx";
let bytes = json.as_bytes();

Python::with_gil(|py| {
let r = python_parse(py, bytes, false, true);
let r = python_parse(py, bytes, false, true, false);
let e = r.map_err(|e| map_json_error(bytes, &e)).unwrap_err();
assert_eq!(e.to_string(), "ValueError: expected value at line 1 column 1");
})
}

#[test]
fn test_partial_array() {
    Python::with_gil(|py| {
        let bytes = br#"["string", true, null, 1, "foo"#;

        // The trailing, unterminated string is dropped from the result.
        let parsed = python_parse(py, bytes, false, true, true).unwrap();
        assert_eq!(parsed.to_string(), "['string', True, None, 1]");

        // Stopping at any point must still produce a list.
        for end in 1..bytes.len() {
            let prefix = &bytes[..end];
            let parsed = python_parse(py, prefix, false, true, true).unwrap();
            assert!(parsed.is_instance_of::<PyList>());
        }
    })
}

#[test]
fn test_partial_object() {
    Python::with_gil(|py| {
        let bytes = br#"{"a": 1, "b": 2, "c"#;

        // The trailing, unterminated key is dropped from the result.
        let parsed = python_parse(py, bytes, false, true, true).unwrap();
        assert_eq!(parsed.to_string(), "{'a': 1, 'b': 2}");

        // Stopping at any point must still produce a dict.
        for end in 1..bytes.len() {
            let prefix = &bytes[..end];
            let parsed = python_parse(py, prefix, false, true, true).unwrap();
            assert!(parsed.is_instance_of::<PyDict>());
        }
    })
}

#[test]
fn test_partial_nested() {
    Python::with_gil(|py| {
        let bytes = br#"{"a": 1, "b": 2, "c": [1, 2, {"d": 1, "#;

        // The containers left open at the cut-off are returned with the
        // members parsed so far.
        let parsed = python_parse(py, bytes, false, true, true).unwrap();
        assert_eq!(parsed.to_string(), "{'a': 1, 'b': 2, 'c': [1, 2, {'d': 1}]}");

        // Stopping at any point must still produce a dict.
        for end in 1..bytes.len() {
            let prefix = &bytes[..end];
            let parsed = python_parse(py, prefix, false, true, true).unwrap();
            assert!(parsed.is_instance_of::<PyDict>());
        }
    })
}

0 comments on commit 56db4fc

Please sign in to comment.