Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use hashbrown to speed up Python string caching #51

Merged
merged 1 commit into from Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Expand Up @@ -17,6 +17,7 @@ ahash = "0.8.0"
smallvec = "1.11.0"
pyo3 = { version = "0.20.0", features = ["num-bigint"], optional = true }
lexical-core = { version = "0.8.5", features = ["format"] }
hashbrown = "0.14.3"

[features]
python = ["dep:pyo3"]
Expand Down
10 changes: 10 additions & 0 deletions benches/python.rs
Expand Up @@ -75,6 +75,14 @@ fn python_parse_string_array(bench: &mut Bencher) {
_python_parse_file("./benches/string_array.json", bench, true);
}

/// Benchmark parsing the unique-strings fixture with string caching disabled,
/// so every string hits the non-cached allocation path.
fn python_parse_string_array_unique_not_cached(bench: &mut Bencher) {
    let fixture = "./benches/string_array_unique.json";
    _python_parse_file(fixture, bench, false);
}

/// Benchmark parsing the unique-strings fixture with string caching enabled;
/// unique keys stress the cache's miss/insert path.
fn python_parse_string_array_unique(bench: &mut Bencher) {
    let fixture = "./benches/string_array_unique.json";
    _python_parse_file(fixture, bench, true);
}

/// Benchmark parsing the true-object fixture with string caching enabled.
fn python_parse_true_object(bench: &mut Bencher) {
    let fixture = "./benches/true_object.json";
    _python_parse_file(fixture, bench, true);
}
Expand All @@ -93,6 +101,8 @@ benchmark_group!(
python_parse_true_object_not_cached,
python_parse_string_array_not_cached,
python_parse_string_array,
python_parse_string_array_unique_not_cached,
python_parse_string_array_unique,
python_parse_true_object,
python_parse_true_array,
);
Expand Down
1 change: 1 addition & 0 deletions benches/string_array_unique.json

Large diffs are not rendered by default.

37 changes: 20 additions & 17 deletions src/python.rs
Expand Up @@ -6,7 +6,7 @@ use pyo3::sync::{GILOnceCell, GILProtected};
use pyo3::types::{PyDict, PyList, PyString};
use pyo3::{ffi, AsPyPointer};

use ahash::AHashMap;
use hashbrown::hash_map::{HashMap, RawEntryMut};
use smallvec::SmallVec;

use crate::errors::{json_err, JsonError, JsonResult, DEFAULT_RECURSION_LIMIT};
Expand Down Expand Up @@ -149,30 +149,33 @@ struct StringCache;

impl StringMaybeCache for StringCache {
fn get(py: Python, json_str: &str) -> PyObject {
static STRINGS_CACHE: GILOnceCell<GILProtected<RefCell<AHashMap<String, PyObject>>>> = GILOnceCell::new();
static STRINGS_CACHE: GILOnceCell<GILProtected<RefCell<HashMap<String, PyObject>>>> = GILOnceCell::new();

// from tests, 0 and 1 character strings are faster not cached
if (2..64).contains(&json_str.len()) {
let cache = STRINGS_CACHE
.get_or_init(py, || GILProtected::new(RefCell::new(AHashMap::new())))
.get_or_init(py, || GILProtected::new(RefCell::new(HashMap::new())))
.get(py);

// Finish the borrow before matching, so that the RefCell isn't borrowed for the whole match.
let key = cache.borrow().get(json_str).map(|key| key.clone_ref(py));

match key {
Some(key) => key,
None => {
let key_object = PyString::new(py, json_str).to_object(py);
let mut cache_writable = cache.borrow_mut();
// 500k limit means 1m keys + values, 1m 64 byte strings is ~64mb
if cache_writable.len() > 500_000 {
cache_writable.clear();
}
cache_writable.insert(json_str.to_owned(), key_object.clone_ref(py));
key_object
let mut map = cache.borrow_mut();
let entry = map.raw_entry_mut().from_key(json_str);

let (py_string, inserted) = match entry {
RawEntryMut::Vacant(view) => {
let py_string = PyString::new(py, json_str).to_object(py);
view.insert(json_str.to_owned(), py_string.clone_ref(py));
(py_string, true)
}
RawEntryMut::Occupied(view) => (view.get().clone_ref(py), false),
};
if inserted {
// 500k limit means 1m keys + values, 1m 64 byte strings is ~64mb
if map.len() > 500_000 {
// TODO is there a fast way to keep (say) half the cache?
map.clear();
}
}
py_string
} else {
let key = PyString::new(py, json_str);
key.to_object(py)
Expand Down