Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add k_smallest_relaxed and variants #925

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 5 additions & 1 deletion Cargo.toml
Expand Up @@ -27,7 +27,7 @@ either = { version = "1.0", default-features = false }

[dev-dependencies]
rand = "0.7"
criterion = "0.4.0"
criterion = { version = "0.4.0", features = ["html_reports"] }
paste = "1.0.0" # Used in test_std to instantiate generic tests
permutohedron = "0.2"
quickcheck = { version = "0.9", default_features = false }
Expand Down Expand Up @@ -75,3 +75,7 @@ harness = false
[[bench]]
name = "specializations"
harness = false

[[bench]]
name = "k_smallest"
harness = false
61 changes: 61 additions & 0 deletions benches/k_smallest.rs
@@ -0,0 +1,61 @@
use criterion::{black_box, criterion_group, criterion_main, Bencher, BenchmarkId, Criterion};
use itertools::Itertools;
use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};

/// Benchmark routine for `Itertools::k_smallest`.
///
/// `input` carries the number of elements to keep together with the values
/// to select from; the iterator is passed through `black_box` before use.
fn strict(b: &mut Bencher, input: &(usize, &Vec<usize>)) {
    let &(k, vals) = input;
    b.iter(|| black_box(vals.iter()).k_smallest(k))
}

/// Benchmark routine for `Itertools::k_smallest_relaxed`.
///
/// `input` carries the number of elements to keep together with the values
/// to select from; the iterator is passed through `black_box` before use.
fn relaxed(b: &mut Bencher, input: &(usize, &Vec<usize>)) {
    let &(k, vals) = input;
    b.iter(|| black_box(vals.iter()).k_smallest_relaxed(k))
}

/// Returns the values `0..n` in increasing order.
fn ascending(n: usize) -> Vec<usize> {
    let mut vals = Vec::with_capacity(n);
    vals.extend(0..n);
    vals
}

fn random(n: usize) -> Vec<usize> {
let mut vals = (0..n).collect_vec();
vals.shuffle(&mut StdRng::seed_from_u64(42));
vals
}

/// Returns the values `0..n` in decreasing order.
fn descending(n: usize) -> Vec<usize> {
    let mut vals = Vec::with_capacity(n);
    vals.extend((0..n).rev());
    vals
}

/// Registers the `strict` and `relaxed` benchmarks in the group
/// `k-smallest/{order}`.
///
/// `vals` builds the input for a given length. Inputs of `2^20`, `2^21` and
/// `2^22` elements are each benchmarked with `k` set to `2^7`, `2^8` and
/// `2^9`; every benchmark is labelled `{log_n}/{log_k}`.
fn k_smallest(c: &mut Criterion, order: &str, vals: fn(usize) -> Vec<usize>) {
    let mut group = c.benchmark_group(format!("k-smallest/{order}"));

    for log_n in 20..23 {
        // Build each input once and share it between all values of `k`.
        let len: usize = 1 << log_n;
        let data = vals(len);

        for log_k in 7..10 {
            let keep: usize = 1 << log_k;
            let label = format!("{log_n}/{log_k}");
            let input = (keep, &data);

            group.bench_with_input(BenchmarkId::new("strict", &label), &input, strict);
            group.bench_with_input(BenchmarkId::new("relaxed", &label), &input, relaxed);
        }
    }

    group.finish();
}

/// Runs the k-smallest benchmarks on input produced by `ascending`.
fn k_smallest_asc(c: &mut Criterion) {
    k_smallest(c, "asc", ascending)
}

/// Runs the k-smallest benchmarks on input produced by `random`.
fn k_smallest_rand(c: &mut Criterion) {
    k_smallest(c, "rand", random)
}

/// Runs the k-smallest benchmarks on input produced by `descending`.
fn k_smallest_desc(c: &mut Criterion) {
    k_smallest(c, "desc", descending)
}

criterion_group!(benches, k_smallest_asc, k_smallest_rand, k_smallest_desc);
criterion_main!(benches);
40 changes: 40 additions & 0 deletions src/k_smallest.rs
Expand Up @@ -88,6 +88,46 @@ where
storage
}

/// Returns the `k` smallest items of `iter`, as ordered by `comparator`,
/// sorted in ascending order.
///
/// The input is always consumed completely. If it yields fewer than `k`
/// items, all of them are returned, sorted. For `k == 0` the input is still
/// drained and an empty vector is returned.
///
/// Candidates are gathered in a buffer that is allowed to grow to `2 * k`
/// items; whenever it is full, a selection step keeps the `k` smallest and
/// discards the rest.
pub(crate) fn k_smallest_relaxed_general<I, F>(iter: I, k: usize, mut comparator: F) -> Vec<I::Item>
where
    I: Iterator,
    F: FnMut(&I::Item, &I::Item) -> Ordering,
{
    if k == 0 {
        // Nothing can be kept, but the iterator is still run to completion.
        iter.last();
        return Vec::new();
    }

    // A plain `2 * k` overflows for `k > usize::MAX / 2`: it panics in debug
    // builds and wraps in release builds, where e.g. `k = 2^63` would turn
    // into `take(0)` and silently drop every item. Saturating keeps the
    // "buffer up to twice `k`" behavior for all realistic `k` and degrades to
    // "buffer everything" for absurdly large ones.
    let limit = k.saturating_mul(2);

    let mut iter = iter.fuse();
    let mut buf = iter.by_ref().take(limit).collect::<Vec<_>>();

    if buf.len() < k {
        // The input had fewer than `k` items: everything is part of the result.
        buf.sort_unstable_by(&mut comparator);
        return buf;
    }

    // Move the `k` smallest items to the front; `buf[k - 1]` is the largest
    // of them and acts as the admission threshold below.
    buf.select_nth_unstable_by(k - 1, &mut comparator);
    buf.truncate(k);

    iter.for_each(|val| {
        // Items not smaller than the threshold cannot be among the `k` smallest.
        if comparator(&val, &buf[k - 1]) != Ordering::Less {
            return;
        }

        // This closure only runs if the first batch filled the buffer with
        // `limit` items, and the buffer holds fewer than `limit` items here,
        // so pushing must never reallocate.
        assert_ne!(buf.len(), buf.capacity());
        buf.push(val);

        if buf.len() == limit {
            buf.select_nth_unstable_by(k - 1, &mut comparator);
            buf.truncate(k);
        }
    });

    // The buffer may hold up to `limit - 1` candidates in arbitrary order:
    // sort them and keep the `k` smallest.
    buf.sort_unstable_by(&mut comparator);
    buf.truncate(k);
    buf
}

#[inline]
pub(crate) fn key_to_cmp<T, K, F>(mut key: F) -> impl FnMut(&T, &T) -> Ordering
where
Expand Down
187 changes: 187 additions & 0 deletions src/lib.rs
Expand Up @@ -3060,6 +3060,105 @@ pub trait Itertools: Iterator {
self.k_smallest_by(k, k_smallest::key_to_cmp(key))
}

/// Sort the k smallest elements into a new iterator, in ascending order,
/// trading additional memory for a linear-time pass over the input.
///
/// **Note:** This consumes the entire iterator, and returns the result
/// as a new iterator that owns its elements. If the input contains
/// fewer than k elements, the result is equivalent to `self.sorted()`.
///
/// This is guaranteed to use `2 * k * sizeof(Self::Item) + O(1)` memory
/// and `O(n + k log k)` time, with `n` the number of elements in the input.
/// It therefore needs more memory than [`k_smallest`](Itertools::k_smallest),
/// but its running time is linear in the number of elements.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// **Note:** This is functionally-equivalent to `self.sorted().take(k)`
/// but much more efficient.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_smallest = numbers
///     .into_iter()
///     .k_smallest_relaxed(5);
///
/// itertools::assert_equal(five_smallest, 0..5);
/// ```
#[cfg(feature = "use_alloc")]
fn k_smallest_relaxed(self, k: usize) -> VecIntoIter<Self::Item>
where
    Self: Sized,
    Self::Item: Ord,
{
    self.k_smallest_relaxed_by(k, Self::Item::cmp)
}

/// Sort the k smallest elements, as ordered by the provided comparison,
/// into a new iterator, trading additional memory for speed.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// This corresponds to `self.sorted_by(cmp).take(k)` in the same way that
/// [`k_smallest_relaxed`](Itertools::k_smallest_relaxed) corresponds to `self.sorted().take(k)`,
/// in both semantics and complexity.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_smallest = numbers
///     .into_iter()
///     .k_smallest_relaxed_by(5, |a, b| (a % 7).cmp(&(b % 7)).then(a.cmp(b)));
///
/// itertools::assert_equal(five_smallest, vec![0, 7, 14, 1, 8]);
/// ```
#[cfg(feature = "use_alloc")]
fn k_smallest_relaxed_by<F>(self, k: usize, cmp: F) -> VecIntoIter<Self::Item>
where
    Self: Sized,
    F: FnMut(&Self::Item, &Self::Item) -> Ordering,
{
    let smallest = k_smallest::k_smallest_relaxed_general(self, k, cmp);
    smallest.into_iter()
}

/// Return the elements producing the k smallest outputs of the provided
/// function, trading additional memory for speed.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// This corresponds to `self.sorted_by_key(key).take(k)` in the same way that
/// [`k_smallest_relaxed`](Itertools::k_smallest_relaxed) corresponds to `self.sorted().take(k)`,
/// in both semantics and complexity.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_smallest = numbers
///     .into_iter()
///     .k_smallest_relaxed_by_key(5, |n| (n % 7, *n));
///
/// itertools::assert_equal(five_smallest, vec![0, 7, 14, 1, 8]);
/// ```
#[cfg(feature = "use_alloc")]
fn k_smallest_relaxed_by_key<F, K>(self, k: usize, key: F) -> VecIntoIter<Self::Item>
where
    Self: Sized,
    F: FnMut(&Self::Item) -> K,
    K: Ord,
{
    // Turn the key extractor into a comparison and defer to the `_by` variant.
    let by_key = k_smallest::key_to_cmp(key);
    self.k_smallest_relaxed_by(k, by_key)
}

/// Sort the k largest elements into a new iterator, in descending order.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
Expand Down Expand Up @@ -3150,6 +3249,94 @@ pub trait Itertools: Iterator {
self.k_largest_by(k, k_smallest::key_to_cmp(key))
}

/// Sort the k largest elements into a new iterator, in descending order,
/// trading additional memory for speed.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// It is semantically equivalent to [`k_smallest_relaxed`](Itertools::k_smallest_relaxed)
/// with a reversed `Ord`.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_largest = numbers
///     .into_iter()
///     .k_largest_relaxed(5);
///
/// itertools::assert_equal(five_largest, vec![14, 13, 12, 11, 10]);
/// ```
#[cfg(feature = "use_alloc")]
fn k_largest_relaxed(self, k: usize) -> VecIntoIter<Self::Item>
where
    Self: Sized,
    Self::Item: Ord,
{
    self.k_largest_relaxed_by(k, Ord::cmp)
}

/// Sort the k largest elements, as ordered by the provided comparison,
/// into a new iterator, trading additional memory for speed.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// Functionally equivalent to [`k_smallest_relaxed_by`](Itertools::k_smallest_relaxed_by)
/// with a reversed `Ord`.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_largest = numbers
///     .into_iter()
///     .k_largest_relaxed_by(5, |a, b| (a % 7).cmp(&(b % 7)).then(a.cmp(b)));
///
/// itertools::assert_equal(five_largest, vec![13, 6, 12, 5, 11]);
/// ```
#[cfg(feature = "use_alloc")]
fn k_largest_relaxed_by<F>(self, k: usize, mut cmp: F) -> VecIntoIter<Self::Item>
where
    Self: Sized,
    F: FnMut(&Self::Item, &Self::Item) -> Ordering,
{
    // Swapping the arguments reverses the ordering, so the "smallest"
    // elements under the swapped comparison are the largest under `cmp`.
    self.k_smallest_relaxed_by(k, move |lhs, rhs| cmp(rhs, lhs))
}

/// Return the elements producing the k largest outputs of the provided
/// function, trading additional memory for speed.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// Functionally equivalent to [`k_smallest_relaxed_by_key`](Itertools::k_smallest_relaxed_by_key)
/// with a reversed `Ord`.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_largest = numbers
///     .into_iter()
///     .k_largest_relaxed_by_key(5, |n| (n % 7, *n));
///
/// itertools::assert_equal(five_largest, vec![13, 6, 12, 5, 11]);
/// ```
#[cfg(feature = "use_alloc")]
fn k_largest_relaxed_by_key<F, K>(self, k: usize, key: F) -> VecIntoIter<Self::Item>
where
    Self: Sized,
    F: FnMut(&Self::Item) -> K,
    K: Ord,
{
    // Turn the key extractor into a comparison and defer to the `_by` variant.
    let by_key = k_smallest::key_to_cmp(key);
    self.k_largest_relaxed_by(k, by_key)
}

/// Consumes the iterator and return an iterator of the last `n` elements.
///
/// The iterator, if directly collected to a `VecDeque`, is converted
Expand Down
46 changes: 44 additions & 2 deletions tests/test_std.rs
Expand Up @@ -527,6 +527,42 @@ qc::quickcheck! {
it::assert_equal(largest_by, sorted_largest.clone());
it::assert_equal(largest_by_key, sorted_largest);
}

fn k_smallest_relaxed_range(n: i64, m: u16, k: u16) -> () {
    // Property: for a shuffled range starting at `n`, every `k_*_relaxed`
    // variant yields the same elements as sorting and taking the first
    // (or last, reversed) `k` values.
    //
    // u16 is used to constrain k and m to 0..2¹⁶,
    // otherwise the test could use too much memory.
    let (k, m) = (k as usize, m as u64);

    let mut v: Vec<_> = (n..n.saturating_add(m as _)).collect();
    // Generate a random permutation of n..n+m
    v.shuffle(&mut thread_rng());

    // Construct the right answers for the top and bottom elements
    let mut sorted = v.clone();
    sorted.sort();
    // How many elements are we checking. This is bounded by the real length
    // rather than by `m`: when `n + m` saturates at `i64::MAX` the range
    // holds fewer than `m` values, and slicing by `min(k, m)` would panic.
    let num_elements = min(k, sorted.len());

    // Compute the top and bottom k in various combinations
    let sorted_smallest = sorted[..num_elements].iter().cloned();
    let smallest = v.iter().cloned().k_smallest_relaxed(k);
    let smallest_by = v.iter().cloned().k_smallest_relaxed_by(k, Ord::cmp);
    let smallest_by_key = v.iter().cloned().k_smallest_relaxed_by_key(k, |&x| x);

    let sorted_largest = sorted[sorted.len() - num_elements..].iter().rev().cloned();
    let largest = v.iter().cloned().k_largest_relaxed(k);
    let largest_by = v.iter().cloned().k_largest_relaxed_by(k, Ord::cmp);
    let largest_by_key = v.iter().cloned().k_largest_relaxed_by_key(k, |&x| x);

    // Check the variations produce the same answers and that they're right
    it::assert_equal(smallest, sorted_smallest.clone());
    it::assert_equal(smallest_by, sorted_smallest.clone());
    it::assert_equal(smallest_by_key, sorted_smallest);

    it::assert_equal(largest, sorted_largest.clone());
    it::assert_equal(largest_by, sorted_largest.clone());
    it::assert_equal(largest_by_key, sorted_largest);
}
}

#[derive(Clone, Debug)]
Expand Down Expand Up @@ -571,8 +607,11 @@ where
I::Item: Ord + Debug,
{
let j = i.clone();
let i1 = i.clone();
let j1 = i.clone();
let k = k as usize;
it::assert_equal(i.k_smallest(k), j.sorted().take(k))
it::assert_equal(i.k_smallest(k), j.sorted().take(k));
it::assert_equal(i1.k_smallest_relaxed(k), j1.sorted().take(k));
}

// Similar to `k_smallest_sort` but for our custom heap implementation.
Expand All @@ -582,8 +621,11 @@ where
I::Item: Ord + Debug,
{
let j = i.clone();
let i1 = i.clone();
let j1 = i.clone();
let k = k as usize;
it::assert_equal(i.k_smallest_by(k, Ord::cmp), j.sorted().take(k))
it::assert_equal(i.k_smallest_by(k, Ord::cmp), j.sorted().take(k));
it::assert_equal(i1.k_smallest_relaxed_by(k, Ord::cmp), j1.sorted().take(k));
}

macro_rules! generic_test {
Expand Down