Merge pull request #11 from huonw/speed++

Optimise bucket placement.
rust-phf · Aug 30, 2014 · 3687c0e · 3687c0e
2 parents af0a11c + a48f95a
commit 3687c0e
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@ Rust-PHF is a library to generate efficient lookup tables at compile time using
 
 It currently uses the
 [CHD algorithm](http://cmph.sourceforge.net/papers/esa09.pdf) and can generate
-a 10,000 entry map in roughly .25 seconds.
+a 100,000 entry map in roughly .4 seconds.
 
 Documentation is available at http://www.rust-ci.org/sfackler/rust-phf/doc/phf/.
 

diff --git a/phf_mac/src/lib.rs b/phf_mac/src/lib.rs
@@ -359,23 +359,41 @@ fn try_generate_hash(entries: &[Entry], rng: &mut XorShiftRng) -> Option<HashSta
     let table_len = entries.len();
     let mut map = Vec::from_elem(table_len, None);
     let mut disps = Vec::from_elem(buckets_len, (0u32, 0u32));
-    let mut try_map = HashMap::new();
+
+    // Track whether an element of the bucket currently being placed
+    // already occupies a given position, allowing efficient overlap
+    // checks. Each cell stores a generation number, and every
+    // placement attempt starts a new generation, so a cell is
+    // occupied by the current attempt only if its stored generation
+    // equals the current one. (A u64 is far too large to overflow in
+    // a reasonable time on current hardware.)
+    let mut try_map = Vec::from_elem(table_len, 0u64);
+    let mut generation = 0u64;
+
+    // the actual values corresponding to the markers above, as
+    // (index, key) pairs, for adding to the main map once we've
+    // chosen the right disps.
+    let mut values_to_add = vec![];
+
     'buckets: for bucket in buckets.iter() {
         for d1 in range(0, table_len as u32) {
             'disps: for d2 in range(0, table_len as u32) {
-                try_map.clear();
+                values_to_add.clear();
+                generation += 1;
+
                 for &key in bucket.keys.iter() {
                     let idx = (shared::displace(hashes[key].f1, hashes[key].f2, d1, d2)
                                 % (table_len as u32)) as uint;
-                    if map[idx].is_some() || try_map.find(&idx).is_some() {
+                    if map[idx].is_some() || try_map[idx] == generation {
                         continue 'disps;
                     }
-                    try_map.insert(idx, key);
+                    *try_map.get_mut(idx) = generation;
+                    values_to_add.push((idx, key));
                 }
 
                 // We've picked a good set of disps
                 *disps.get_mut(bucket.idx) = (d1, d2);
-                for (&idx, &key) in try_map.iter() {
+                for &(idx, key) in values_to_add.iter() {
                     *map.get_mut(idx) = Some(key);
                 }
                 continue 'buckets;