Skip to content

Commit

Permalink
use hashbrown to speed up Python string caching (#51)
Browse files Browse the repository at this point in the history
Co-authored-by: Samuel Colvin <s@muelcolvin.com>
  • Loading branch information
davidhewitt and samuelcolvin committed Dec 4, 2023
1 parent c7deb6d commit 849d5b8
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 17 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Expand Up @@ -17,6 +17,7 @@ ahash = "0.8.0"
smallvec = "1.11.0"
pyo3 = { version = "0.20.0", features = ["num-bigint"], optional = true }
lexical-core = { version = "0.8.5", features = ["format"] }
hashbrown = "0.14.3"

[features]
python = ["dep:pyo3"]
Expand Down
10 changes: 10 additions & 0 deletions benches/python.rs
Expand Up @@ -75,6 +75,14 @@ fn python_parse_string_array(bench: &mut Bencher) {
_python_parse_file("./benches/string_array.json", bench, true);
}

/// Benchmark over `./benches/string_array_unique.json`, delegating to `_python_parse_file`.
///
/// Passes `false` as the final argument. NOTE(review): going by the `_not_cached` suffix,
/// that flag presumably turns Python string caching off — confirm against
/// `_python_parse_file`.
fn python_parse_string_array_unique_not_cached(bench: &mut Bencher) {
_python_parse_file("./benches/string_array_unique.json", bench, false);
}

/// Benchmark over `./benches/string_array_unique.json`, delegating to `_python_parse_file`.
///
/// Passes `true` as the final argument. NOTE(review): by contrast with the `_not_cached`
/// variant, that flag presumably turns Python string caching on — confirm against
/// `_python_parse_file`.
fn python_parse_string_array_unique(bench: &mut Bencher) {
_python_parse_file("./benches/string_array_unique.json", bench, true);
}

/// Benchmark over `./benches/true_object.json`, delegating to `_python_parse_file`.
///
/// Passes `true` as the final argument. NOTE(review): presumably this enables Python string
/// caching, matching the other benchmarks without a `_not_cached` suffix — confirm against
/// `_python_parse_file`.
fn python_parse_true_object(bench: &mut Bencher) {
_python_parse_file("./benches/true_object.json", bench, true);
}
Expand All @@ -93,6 +101,8 @@ benchmark_group!(
python_parse_true_object_not_cached,
python_parse_string_array_not_cached,
python_parse_string_array,
python_parse_string_array_unique_not_cached,
python_parse_string_array_unique,
python_parse_true_object,
python_parse_true_array,
);
Expand Down
1 change: 1 addition & 0 deletions benches/string_array_unique.json

Large diffs are not rendered by default.

37 changes: 20 additions & 17 deletions src/python.rs
Expand Up @@ -6,7 +6,7 @@ use pyo3::sync::{GILOnceCell, GILProtected};
use pyo3::types::{PyDict, PyList, PyString};
use pyo3::{ffi, AsPyPointer};

use ahash::AHashMap;
use hashbrown::hash_map::{HashMap, RawEntryMut};
use smallvec::SmallVec;

use crate::errors::{json_err, JsonError, JsonResult, DEFAULT_RECURSION_LIMIT};
Expand Down Expand Up @@ -151,30 +151,33 @@ struct StringCache;

impl StringMaybeCache for StringCache {
fn get(py: Python, json_str: &str) -> PyObject {
static STRINGS_CACHE: GILOnceCell<GILProtected<RefCell<AHashMap<String, PyObject>>>> = GILOnceCell::new();
static STRINGS_CACHE: GILOnceCell<GILProtected<RefCell<HashMap<String, PyObject>>>> = GILOnceCell::new();

// from tests, 0 and 1 character strings are faster not cached
if (2..64).contains(&json_str.len()) {
let cache = STRINGS_CACHE
.get_or_init(py, || GILProtected::new(RefCell::new(AHashMap::new())))
.get_or_init(py, || GILProtected::new(RefCell::new(HashMap::new())))
.get(py);

// Finish the borrow before matching, so that the RefCell isn't borrowed for the whole match.
let key = cache.borrow().get(json_str).map(|key| key.clone_ref(py));

match key {
Some(key) => key,
None => {
let key_object = PyString::new(py, json_str).to_object(py);
let mut cache_writable = cache.borrow_mut();
// 500k limit means 1m keys + values, 1m 64 byte strings is ~64mb
if cache_writable.len() > 500_000 {
cache_writable.clear();
}
cache_writable.insert(json_str.to_owned(), key_object.clone_ref(py));
key_object
let mut map = cache.borrow_mut();
let entry = map.raw_entry_mut().from_key(json_str);

let (py_string, inserted) = match entry {
RawEntryMut::Vacant(view) => {
let py_string = PyString::new(py, json_str).to_object(py);
view.insert(json_str.to_owned(), py_string.clone_ref(py));
(py_string, true)
}
RawEntryMut::Occupied(view) => (view.get().clone_ref(py), false),
};
if inserted {
// 500k limit means 1m keys + values, 1m 64 byte strings is ~64mb
if map.len() > 500_000 {
// TODO is there a fast way to keep (say) half the cache?
map.clear();
}
}
py_string
} else {
let key = PyString::new(py, json_str);
key.to_object(py)
Expand Down

0 comments on commit 849d5b8

Please sign in to comment.