Skip to content

Commit

Permalink
Add k_smallest_relaxed and variants
Browse files Browse the repository at this point in the history
This implements the algorithm described in [1] which consumes twice the amount
of memory as the existing `k_smallest` algorithm but achieves linear time in the
number of elements in the input.

[1] https://quickwit.io/blog/top-k-complexity
  • Loading branch information
adamreichold committed Apr 27, 2024
1 parent dd6a569 commit 19747d1
Show file tree
Hide file tree
Showing 3 changed files with 274 additions and 2 deletions.
43 changes: 43 additions & 0 deletions src/k_smallest.rs
Expand Up @@ -88,6 +88,49 @@ where
storage
}

pub(crate) fn k_smallest_relaxed_general<I, F>(
mut iter: I,
k: usize,
mut comparator: F,
) -> Vec<I::Item>
where
I: Iterator,
F: FnMut(&I::Item, &I::Item) -> Ordering,
{
if k == 0 {
iter.last();
return Vec::new();
}

let mut buf = Vec::with_capacity(2 * k);
buf.extend(iter.by_ref().take(2 * k));

if buf.len() < k {
buf.sort_unstable_by(comparator);
return buf;
}

buf.select_nth_unstable_by(k - 1, |a, b| comparator(a, b));
buf.truncate(k);

iter.for_each(|val| {
if comparator(&val, &buf[k - 1]) != Ordering::Less {
return;
}

buf.push(val);

if buf.len() == 2 * k {
buf.select_nth_unstable_by(k - 1, |a, b| comparator(a, b));
buf.truncate(k);
}
});

buf.sort_unstable_by(comparator);
buf.truncate(k);
buf
}

#[inline]
pub(crate) fn key_to_cmp<T, K, F>(mut key: F) -> impl FnMut(&T, &T) -> Ordering
where
Expand Down
187 changes: 187 additions & 0 deletions src/lib.rs
Expand Up @@ -3060,6 +3060,105 @@ pub trait Itertools: Iterator {
self.k_smallest_by(k, k_smallest::key_to_cmp(key))
}

/// Sort the k smallest elements into a new iterator, in ascending order, relaxing the amount of memory required.
///
/// **Note:** This consumes the entire iterator, and returns the result
/// as a new iterator that owns its elements. If the input contains
/// less than k elements, the result is equivalent to `self.sorted()`.
///
/// This is guaranteed to use `2 * k * sizeof(Self::Item) + O(1)` memory
/// and `O(n + k log k)` time, with `n` the number of elements in the input,
/// meaning it uses more memory than the minimum obtained by [`k_smallest`](Itertools::k_smallest)
/// but achieves linear time in the number of elements.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// **Note:** This is functionally-equivalent to `self.sorted().take(k)`
/// but much more efficient.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_smallest = numbers
/// .into_iter()
/// .k_smallest_relaxed(5);
///
/// itertools::assert_equal(five_smallest, 0..5);
/// ```
#[cfg(feature = "use_alloc")]
fn k_smallest_relaxed(self, k: usize) -> VecIntoIter<Self::Item>
where
Self: Sized,
Self::Item: Ord,
{
self.k_smallest_by(k, Ord::cmp)
}

/// Sort the k smallest elements into a new iterator using the provided comparison, relaxing the amount of memory required.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// This corresponds to `self.sorted_by(cmp).take(k)` in the same way that
/// [`k_smallest_relaxed`](Itertools::k_smallest_relaxed) corresponds to `self.sorted().take(k)`,
/// in both semantics and complexity.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_smallest = numbers
/// .into_iter()
/// .k_smallest_relaxed_by(5, |a, b| (a % 7).cmp(&(b % 7)).then(a.cmp(b)));
///
/// itertools::assert_equal(five_smallest, vec![0, 7, 14, 1, 8]);
/// ```
#[cfg(feature = "use_alloc")]
fn k_smallest_relaxed_by<F>(self, k: usize, cmp: F) -> VecIntoIter<Self::Item>
where
Self: Sized,
F: FnMut(&Self::Item, &Self::Item) -> Ordering,
{
k_smallest::k_smallest_relaxed_general(self, k, cmp).into_iter()
}

/// Return the elements producing the k smallest outputs of the provided function, relaxing the amount of memory required.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// This corresponds to `self.sorted_by_key(key).take(k)` in the same way that
/// [`k_smallest_relaxed`](Itertools::k_smallest_relaxed) corresponds to `self.sorted().take(k)`,
/// in both semantics and complexity.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_smallest = numbers
/// .into_iter()
/// .k_smallest_by_key(5, |n| (n % 7, *n));
///
/// itertools::assert_equal(five_smallest, vec![0, 7, 14, 1, 8]);
/// ```
#[cfg(feature = "use_alloc")]
fn k_smallest_relaxed_by_key<F, K>(self, k: usize, key: F) -> VecIntoIter<Self::Item>
where
Self: Sized,
F: FnMut(&Self::Item) -> K,
K: Ord,
{
self.k_smallest_by(k, k_smallest::key_to_cmp(key))
}

/// Sort the k largest elements into a new iterator, in descending order.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
Expand Down Expand Up @@ -3150,6 +3249,94 @@ pub trait Itertools: Iterator {
self.k_largest_by(k, k_smallest::key_to_cmp(key))
}

/// Sort the k largest elements into a new iterator, in descending order, relaxing the amount of memory required.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// It is semantically equivalent to [`k_smallest_relaxed`](Itertools::k_smallest_relaxed)
/// with a reversed `Ord`.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_largest = numbers
/// .into_iter()
/// .k_largest_relaxed(5);
///
/// itertools::assert_equal(five_largest, vec![14, 13, 12, 11, 10]);
/// ```
#[cfg(feature = "use_alloc")]
fn k_largest_relaxed(self, k: usize) -> VecIntoIter<Self::Item>
where
Self: Sized,
Self::Item: Ord,
{
self.k_largest_relaxed_by(k, Self::Item::cmp)
}

/// Sort the k largest elements into a new iterator using the provided comparison, relaxing the amount of memory required.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// Functionally equivalent to [`k_smallest_relaxed_by`](Itertools::k_smallest_reaxled_by)
/// with a reversed `Ord`.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_largest = numbers
/// .into_iter()
/// .k_largest_relaxed_by(5, |a, b| (a % 7).cmp(&(b % 7)).then(a.cmp(b)));
///
/// itertools::assert_equal(five_largest, vec![13, 6, 12, 5, 11]);
/// ```
#[cfg(feature = "use_alloc")]
fn k_largest_relaxed_by<F>(self, k: usize, mut cmp: F) -> VecIntoIter<Self::Item>
where
Self: Sized,
F: FnMut(&Self::Item, &Self::Item) -> Ordering,
{
self.k_smallest_relaxed_by(k, move |a, b| cmp(b, a))
}

/// Return the elements producing the k largest outputs of the provided function, relaxing the amount of memory required.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// Functionally equivalent to [`k_smallest_relaxed_by_key`](Itertools::k_smallest_relaxed_by_key)
/// with a reversed `Ord`.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_largest = numbers
/// .into_iter()
/// .k_largest_relaxed_by_key(5, |n| (n % 7, *n));
///
/// itertools::assert_equal(five_largest, vec![13, 6, 12, 5, 11]);
/// ```
#[cfg(feature = "use_alloc")]
fn k_largest_relaxed_by_key<F, K>(self, k: usize, key: F) -> VecIntoIter<Self::Item>
where
Self: Sized,
F: FnMut(&Self::Item) -> K,
K: Ord,
{
self.k_largest_relaxed_by(k, k_smallest::key_to_cmp(key))
}

/// Consumes the iterator and return an iterator of the last `n` elements.
///
/// The iterator, if directly collected to a `VecDeque`, is converted
Expand Down
46 changes: 44 additions & 2 deletions tests/test_std.rs
Expand Up @@ -527,6 +527,42 @@ qc::quickcheck! {
it::assert_equal(largest_by, sorted_largest.clone());
it::assert_equal(largest_by_key, sorted_largest);
}

fn k_smallest_relaxed_range(n: i64, m: u16, k: u16) -> () {
// u16 is used to constrain k and m to 0..2¹⁶,
// otherwise the test could use too much memory.
let (k, m) = (k as usize, m as u64);

let mut v: Vec<_> = (n..n.saturating_add(m as _)).collect();
// Generate a random permutation of n..n+m
v.shuffle(&mut thread_rng());

// Construct the right answers for the top and bottom elements
let mut sorted = v.clone();
sorted.sort();
// how many elements are we checking
let num_elements = min(k, m as _);

// Compute the top and bottom k in various combinations
let sorted_smallest = sorted[..num_elements].iter().cloned();
let smallest = v.iter().cloned().k_smallest_relaxed(k);
let smallest_by = v.iter().cloned().k_smallest_relaxed_by(k, Ord::cmp);
let smallest_by_key = v.iter().cloned().k_smallest_relaxed_by_key(k, |&x| x);

let sorted_largest = sorted[sorted.len() - num_elements..].iter().rev().cloned();
let largest = v.iter().cloned().k_largest_relaxed(k);
let largest_by = v.iter().cloned().k_largest_relaxed_by(k, Ord::cmp);
let largest_by_key = v.iter().cloned().k_largest_relaxed_by_key(k, |&x| x);

// Check the variations produce the same answers and that they're right
it::assert_equal(smallest, sorted_smallest.clone());
it::assert_equal(smallest_by, sorted_smallest.clone());
it::assert_equal(smallest_by_key, sorted_smallest);

it::assert_equal(largest, sorted_largest.clone());
it::assert_equal(largest_by, sorted_largest.clone());
it::assert_equal(largest_by_key, sorted_largest);
}
}

#[derive(Clone, Debug)]
Expand Down Expand Up @@ -571,8 +607,11 @@ where
I::Item: Ord + Debug,
{
let j = i.clone();
let i1 = i.clone();
let j1 = i.clone();
let k = k as usize;
it::assert_equal(i.k_smallest(k), j.sorted().take(k))
it::assert_equal(i.k_smallest(k), j.sorted().take(k));
it::assert_equal(i1.k_smallest_relaxed(k), j1.sorted().take(k));
}

// Similar to `k_smallest_sort` but for our custom heap implementation.
Expand All @@ -582,8 +621,11 @@ where
I::Item: Ord + Debug,
{
let j = i.clone();
let i1 = i.clone();
let j1 = i.clone();
let k = k as usize;
it::assert_equal(i.k_smallest_by(k, Ord::cmp), j.sorted().take(k))
it::assert_equal(i.k_smallest_by(k, Ord::cmp), j.sorted().take(k));
it::assert_equal(i1.k_smallest_relaxed_by(k, Ord::cmp), j1.sorted().take(k));
}

macro_rules! generic_test {
Expand Down

0 comments on commit 19747d1

Please sign in to comment.