Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce unnecessary Vec allocations and indirections #77

Merged
merged 1 commit into from Sep 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
77 changes: 41 additions & 36 deletions src/decode/lzma.rs
@@ -1,8 +1,8 @@
use crate::decode::lzbuffer::{LzBuffer, LzCircularBuffer};
use crate::decode::rangecoder;
use crate::decode::rangecoder::RangeDecoder;
use crate::decode::rangecoder::{BitTree, LenDecoder, RangeDecoder};
use crate::decompress::{Options, UnpackedSize};
use crate::error;
use crate::util::vec2d::Vec2D;
use byteorder::{LittleEndian, ReadBytesExt};
use std::io;

Expand Down Expand Up @@ -166,9 +166,9 @@ pub(crate) struct DecoderState {
partial_input_buf: std::io::Cursor<[u8; MAX_REQUIRED_INPUT]>,
pub(crate) lzma_props: LzmaProperties,
unpacked_size: Option<u64>,
literal_probs: Vec<Vec<u16>>,
pos_slot_decoder: Vec<rangecoder::BitTree>,
align_decoder: rangecoder::BitTree,
literal_probs: Vec2D<u16>,
pos_slot_decoder: [BitTree; 4],
align_decoder: BitTree,
pos_decoders: [u16; 115],
is_match: [u16; 192], // true = LZ, false = literal
is_rep: [u16; 12],
Expand All @@ -178,8 +178,8 @@ pub(crate) struct DecoderState {
is_rep_0long: [u16; 192],
state: usize,
rep: [usize; 4],
len_decoder: rangecoder::LenDecoder,
rep_len_decoder: rangecoder::LenDecoder,
len_decoder: LenDecoder,
rep_len_decoder: LenDecoder,
}

impl DecoderState {
Expand All @@ -189,9 +189,14 @@ impl DecoderState {
partial_input_buf: std::io::Cursor::new([0; MAX_REQUIRED_INPUT]),
lzma_props,
unpacked_size,
literal_probs: vec![vec![0x400; 0x300]; 1 << (lzma_props.lc + lzma_props.lp)],
pos_slot_decoder: vec![rangecoder::BitTree::new(6); 4],
align_decoder: rangecoder::BitTree::new(4),
literal_probs: Vec2D::init(0x400, (1 << (lzma_props.lc + lzma_props.lp), 0x300)),
pos_slot_decoder: [
BitTree::new(6),
BitTree::new(6),
BitTree::new(6),
BitTree::new(6),
],
align_decoder: BitTree::new(4),
pos_decoders: [0x400; 115],
is_match: [0x400; 192],
is_rep: [0x400; 12],
Expand All @@ -201,33 +206,36 @@ impl DecoderState {
is_rep_0long: [0x400; 192],
state: 0,
rep: [0; 4],
len_decoder: rangecoder::LenDecoder::new(),
rep_len_decoder: rangecoder::LenDecoder::new(),
len_decoder: LenDecoder::new(),
rep_len_decoder: LenDecoder::new(),
}
}

pub fn reset_state(&mut self, new_props: LzmaProperties) {
new_props.validate();
if self.lzma_props.lc + self.lzma_props.lp == new_props.lc + new_props.lp {
// We can reset here by filling the existing buffer with 0x400.
self.literal_probs.iter_mut().for_each(|v| v.fill(0x400))
self.literal_probs.fill(0x400);
} else {
// We need to reallocate because of the new size of `lc+lp`.
self.literal_probs = vec![vec![0x400; 0x300]; 1 << (new_props.lc + new_props.lp)];
self.literal_probs = Vec2D::init(0x400, (1 << (new_props.lc + new_props.lp), 0x300));
}

self.lzma_props = new_props;
self.pos_slot_decoder.iter_mut().for_each(|t| t.reset());
self.align_decoder.reset();
self.pos_decoders.fill(0x400);
self.is_match.fill(0x400);
self.is_rep.fill(0x400);
self.is_rep_g0.fill(0x400);
self.is_rep_g1.fill(0x400);
self.is_rep_g2.fill(0x400);
self.is_rep_0long.fill(0x400);
// For stack-allocated arrays, it was found to be faster to re-create new arrays
// dropping the existing one, rather than using `fill` to reset the contents to zero.
// Heap-based arrays use fill to keep their allocation rather than reallocate.
self.pos_decoders = [0x400; 115];
self.is_match = [0x400; 192];
self.is_rep = [0x400; 12];
self.is_rep_g0 = [0x400; 12];
self.is_rep_g1 = [0x400; 12];
self.is_rep_g2 = [0x400; 12];
self.is_rep_0long = [0x400; 192];
self.state = 0;
self.rep.fill(0);
self.rep = [0; 4];
Comment on lines +230 to +238
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this brings any readability benefit over fill. The compiler should be able to optimize both versions of the code regardless.

I see you however observed otherwise:

I found that it was faster to just recreate stack-allocated arrays in DecoderState than bother with filling them. Heap-backed arrays still use fill to keep their allocation.

Can you add a comment here to explain that?

(IMO, this might also be worth filing a bug to the Rust compiler, to understand if fill being slower is intentional or not a missed optimization somewhere in the compiler.)

self.len_decoder.reset();
self.rep_len_decoder.reset();
}
Expand All @@ -239,7 +247,7 @@ impl DecoderState {
pub fn process<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
&mut self,
output: &mut LZB,
rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
rangecoder: &mut RangeDecoder<'a, R>,
) -> error::Result<()> {
self.process_mode(output, rangecoder, ProcessingMode::Finish)
}
Expand All @@ -248,7 +256,7 @@ impl DecoderState {
pub fn process_stream<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
&mut self,
output: &mut LZB,
rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
rangecoder: &mut RangeDecoder<'a, R>,
) -> error::Result<()> {
self.process_mode(output, rangecoder, ProcessingMode::Partial)
}
Expand All @@ -262,7 +270,7 @@ impl DecoderState {
fn process_next_inner<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
&mut self,
output: &mut LZB,
rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
rangecoder: &mut RangeDecoder<'a, R>,
update: bool,
) -> error::Result<ProcessingStatus> {
let pos_state = output.len() & ((1 << self.lzma_props.pb) - 1);
Expand Down Expand Up @@ -379,7 +387,7 @@ impl DecoderState {
fn process_next<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
&mut self,
output: &mut LZB,
rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
rangecoder: &mut RangeDecoder<'a, R>,
) -> error::Result<ProcessingStatus> {
self.process_next_inner(output, rangecoder, true)
}
Expand All @@ -397,15 +405,15 @@ impl DecoderState {
code: u32,
) -> error::Result<()> {
let mut temp = std::io::Cursor::new(buf);
let mut rangecoder = rangecoder::RangeDecoder::from_parts(&mut temp, range, code);
let mut rangecoder = RangeDecoder::from_parts(&mut temp, range, code);
let _ = self.process_next_inner(output, &mut rangecoder, false)?;
Ok(())
}

/// Utility function to read data into the partial input buffer.
fn read_partial_input_buf<'a, R: io::BufRead>(
&mut self,
rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
rangecoder: &mut RangeDecoder<'a, R>,
) -> error::Result<()> {
// Fill as much of the tmp buffer as possible
let start = self.partial_input_buf.position() as usize;
Expand All @@ -419,7 +427,7 @@ impl DecoderState {
fn process_mode<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
&mut self,
output: &mut LZB,
rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
rangecoder: &mut RangeDecoder<'a, R>,
mode: ProcessingMode,
) -> error::Result<()> {
loop {
Expand Down Expand Up @@ -460,11 +468,8 @@ impl DecoderState {
// Run the decompressor on the tmp buffer
let mut tmp_reader =
io::Cursor::new(&tmp[..self.partial_input_buf.position() as usize]);
let mut tmp_rangecoder = rangecoder::RangeDecoder::from_parts(
&mut tmp_reader,
rangecoder.range,
rangecoder.code,
);
let mut tmp_rangecoder =
RangeDecoder::from_parts(&mut tmp_reader, rangecoder.range, rangecoder.code);
let res = self.process_next(output, &mut tmp_rangecoder)?;

// Update the actual rangecoder
Expand Down Expand Up @@ -513,7 +518,7 @@ impl DecoderState {
fn decode_literal<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
&mut self,
output: &mut LZB,
rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
rangecoder: &mut RangeDecoder<'a, R>,
update: bool,
) -> error::Result<u8> {
let def_prev_byte = 0u8;
Expand Down Expand Up @@ -549,7 +554,7 @@ impl DecoderState {

fn decode_distance<'a, R: io::BufRead>(
&mut self,
rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
rangecoder: &mut RangeDecoder<'a, R>,
length: usize,
update: bool,
) -> error::Result<usize> {
Expand Down
42 changes: 38 additions & 4 deletions src/decode/rangecoder.rs
Expand Up @@ -190,8 +190,8 @@ impl BitTree {
pub struct LenDecoder {
choice: u16,
choice2: u16,
low_coder: Vec<BitTree>,
mid_coder: Vec<BitTree>,
low_coder: [BitTree; 16],
mid_coder: [BitTree; 16],
high_coder: BitTree,
}

Expand All @@ -200,8 +200,42 @@ impl LenDecoder {
LenDecoder {
choice: 0x400,
choice2: 0x400,
low_coder: vec![BitTree::new(3); 16],
mid_coder: vec![BitTree::new(3); 16],
low_coder: [
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
],
mid_coder: [
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
BitTree::new(3),
],
high_coder: BitTree::new(8),
}
}
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Expand Up @@ -12,6 +12,7 @@ mod encode;

pub mod error;

mod util;
mod xz;

use std::io;
Expand Down
1 change: 1 addition & 0 deletions src/util/mod.rs
@@ -0,0 +1 @@
pub mod vec2d;