use hashbrown to speed up python string caching (#51)

Co-authored-by: Samuel Colvin <s@muelcolvin.com>
pydantic · Dec 4, 2023 · 849d5b8 · 849d5b8
1 parent c7deb6d
commit 849d5b8
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 17 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -17,6 +17,7 @@ ahash = "0.8.0"
 smallvec = "1.11.0"
 pyo3 = { version = "0.20.0", features = ["num-bigint"], optional = true }
 lexical-core = { version = "0.8.5", features = ["format"] }
+hashbrown = "0.14.3"
 
 [features]
 python = ["dep:pyo3"]

diff --git a/benches/python.rs b/benches/python.rs
@@ -75,6 +75,14 @@ fn python_parse_string_array(bench: &mut Bencher) {
     _python_parse_file("./benches/string_array.json", bench, true);
 }
 
+fn python_parse_string_array_unique_not_cached(bench: &mut Bencher) {
+    _python_parse_file("./benches/string_array_unique.json", bench, false);
+}
+
+fn python_parse_string_array_unique(bench: &mut Bencher) {
+    _python_parse_file("./benches/string_array_unique.json", bench, true);
+}
+
 fn python_parse_true_object(bench: &mut Bencher) {
     _python_parse_file("./benches/true_object.json", bench, true);
 }
@@ -93,6 +101,8 @@ benchmark_group!(
     python_parse_true_object_not_cached,
     python_parse_string_array_not_cached,
     python_parse_string_array,
+    python_parse_string_array_unique_not_cached,
+    python_parse_string_array_unique,
     python_parse_true_object,
     python_parse_true_array,
 );

diff --git a/benches/string_array_unique.json b/benches/string_array_unique.json
diff --git a/src/python.rs b/src/python.rs
@@ -6,7 +6,7 @@ use pyo3::sync::{GILOnceCell, GILProtected};
 use pyo3::types::{PyDict, PyList, PyString};
 use pyo3::{ffi, AsPyPointer};
 
-use ahash::AHashMap;
+use hashbrown::hash_map::{HashMap, RawEntryMut};
 use smallvec::SmallVec;
 
 use crate::errors::{json_err, JsonError, JsonResult, DEFAULT_RECURSION_LIMIT};
@@ -151,30 +151,33 @@ struct StringCache;
 
 impl StringMaybeCache for StringCache {
     fn get(py: Python, json_str: &str) -> PyObject {
-        static STRINGS_CACHE: GILOnceCell<GILProtected<RefCell<AHashMap<String, PyObject>>>> = GILOnceCell::new();
+        static STRINGS_CACHE: GILOnceCell<GILProtected<RefCell<HashMap<String, PyObject>>>> = GILOnceCell::new();
 
         // from tests, 0 and 1 character strings are faster not cached
         if (2..64).contains(&json_str.len()) {
             let cache = STRINGS_CACHE
-                .get_or_init(py, || GILProtected::new(RefCell::new(AHashMap::new())))
+                .get_or_init(py, || GILProtected::new(RefCell::new(HashMap::new())))
                 .get(py);
 
-            // Finish the borrow before matching, so that the RefCell isn't borrowed for the whole match.
-            let key = cache.borrow().get(json_str).map(|key| key.clone_ref(py));
-
-            match key {
-                Some(key) => key,
-                None => {
-                    let key_object = PyString::new(py, json_str).to_object(py);
-                    let mut cache_writable = cache.borrow_mut();
-                    // 500k limit means 1m keys + values, 1m 64 byte strings is ~64mb
-                    if cache_writable.len() > 500_000 {
-                        cache_writable.clear();
-                    }
-                    cache_writable.insert(json_str.to_owned(), key_object.clone_ref(py));
-                    key_object
+            let mut map = cache.borrow_mut();
+            let entry = map.raw_entry_mut().from_key(json_str);
+
+            let (py_string, inserted) = match entry {
+                RawEntryMut::Vacant(view) => {
+                    let py_string = PyString::new(py, json_str).to_object(py);
+                    view.insert(json_str.to_owned(), py_string.clone_ref(py));
+                    (py_string, true)
+                }
+                RawEntryMut::Occupied(view) => (view.get().clone_ref(py), false),
+            };
+            if inserted {
+                // 500k limit means 1m keys + values, 1m 64 byte strings is ~64mb
+                if map.len() > 500_000 {
+                    // TODO is there a fast way to keep (say) half the cache?
+                    map.clear();
                 }
             }
+            py_string
         } else {
             let key = PyString::new(py, json_str);
             key.to_object(py)