From 086d897edc81915a2626521160a2322576373a10 Mon Sep 17 00:00:00 2001
From: chyyran <ronny@ronnychan.ca>
Date: Sat, 6 Aug 2022 03:27:42 -0400
Subject: [PATCH] Reduce unnecessary allocations and indirections

* Changed literal_probs array from a Vec<Vec<u16>> to a Vec2D backed by a contiguous allocation
* BitTrees in LenDecoder and DecoderState are now stored inline. The actual BitTree data still
  lives in a Vec but one level of indirection is reduced.
* Don't bother with filling stack-allocated DecoderState arrays on reset, and just recreate the
  arrays dropping the existing ones.
---
 src/decode/lzma.rs       |  77 ++++++++++---------
 src/decode/rangecoder.rs |  42 +++++++++-
 src/lib.rs               |   1 +
 src/util/mod.rs          |   1 +
 src/util/vec2d.rs        | 161 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 242 insertions(+), 40 deletions(-)
 create mode 100644 src/util/mod.rs
 create mode 100644 src/util/vec2d.rs
diff --git a/src/decode/lzma.rs b/src/decode/lzma.rs
index 036d3f2..7d1d5b3 100644
--- a/src/decode/lzma.rs
+++ b/src/decode/lzma.rs
@@ -1,8 +1,8 @@
 use crate::decode::lzbuffer::{LzBuffer, LzCircularBuffer};
-use crate::decode::rangecoder;
-use crate::decode::rangecoder::RangeDecoder;
+use crate::decode::rangecoder::{BitTree, LenDecoder, RangeDecoder};
 use crate::decompress::{Options, UnpackedSize};
 use crate::error;
+use crate::util::vec2d::Vec2D;
 use byteorder::{LittleEndian, ReadBytesExt};
 use std::io;
 
@@ -166,9 +166,9 @@ pub(crate) struct DecoderState {
     partial_input_buf: std::io::Cursor<[u8; MAX_REQUIRED_INPUT]>,
     pub(crate) lzma_props: LzmaProperties,
     unpacked_size: Option<u64>,
-    literal_probs: Vec<Vec<u16>>,
-    pos_slot_decoder: Vec<rangecoder::BitTree>,
-    align_decoder: rangecoder::BitTree,
+    literal_probs: Vec2D<u16>,
+    pos_slot_decoder: [BitTree; 4],
+    align_decoder: BitTree,
     pos_decoders: [u16; 115],
     is_match: [u16; 192], // true = LZ, false = literal
     is_rep: [u16; 12],
@@ -178,8 +178,8 @@ pub(crate) struct DecoderState {
     is_rep_0long: [u16; 192],
     state: usize,
     rep: [usize; 4],
-    len_decoder: rangecoder::LenDecoder,
-    rep_len_decoder: rangecoder::LenDecoder,
+    len_decoder: LenDecoder,
+    rep_len_decoder: LenDecoder,
 }
 
 impl DecoderState {
@@ -189,9 +189,14 @@ impl DecoderState {
             partial_input_buf: std::io::Cursor::new([0; MAX_REQUIRED_INPUT]),
             lzma_props,
             unpacked_size,
-            literal_probs: vec![vec![0x400; 0x300]; 1 << (lzma_props.lc + lzma_props.lp)],
-            pos_slot_decoder: vec![rangecoder::BitTree::new(6); 4],
-            align_decoder: rangecoder::BitTree::new(4),
+            literal_probs: Vec2D::init(0x400, (1 << (lzma_props.lc + lzma_props.lp), 0x300)),
+            pos_slot_decoder: [
+                BitTree::new(6),
+                BitTree::new(6),
+                BitTree::new(6),
+                BitTree::new(6),
+            ],
+            align_decoder: BitTree::new(4),
             pos_decoders: [0x400; 115],
             is_match: [0x400; 192],
             is_rep: [0x400; 12],
@@ -201,8 +206,8 @@ impl DecoderState {
             is_rep_0long: [0x400; 192],
             state: 0,
             rep: [0; 4],
-            len_decoder: rangecoder::LenDecoder::new(),
-            rep_len_decoder: rangecoder::LenDecoder::new(),
+            len_decoder: LenDecoder::new(),
+            rep_len_decoder: LenDecoder::new(),
         }
     }
 
@@ -210,24 +215,27 @@ impl DecoderState {
         new_props.validate();
         if self.lzma_props.lc + self.lzma_props.lp == new_props.lc + new_props.lp {
             // We can reset here by filling the existing buffer with 0x400.
-            self.literal_probs.iter_mut().for_each(|v| v.fill(0x400))
+            self.literal_probs.fill(0x400);
         } else {
             // We need to reallocate because of the new size of `lc+lp`.
-            self.literal_probs = vec![vec![0x400; 0x300]; 1 << (new_props.lc + new_props.lp)];
+            self.literal_probs = Vec2D::init(0x400, (1 << (new_props.lc + new_props.lp), 0x300));
         }
 
         self.lzma_props = new_props;
         self.pos_slot_decoder.iter_mut().for_each(|t| t.reset());
         self.align_decoder.reset();
-        self.pos_decoders.fill(0x400);
-        self.is_match.fill(0x400);
-        self.is_rep.fill(0x400);
-        self.is_rep_g0.fill(0x400);
-        self.is_rep_g1.fill(0x400);
-        self.is_rep_g2.fill(0x400);
-        self.is_rep_0long.fill(0x400);
+        // For stack-allocated arrays, it was found to be faster to re-create new arrays
+        // dropping the existing one, rather than using `fill` to reset the contents to zero.
+        // Heap-based arrays use fill to keep their allocation rather than reallocate.
+        self.pos_decoders = [0x400; 115];
+        self.is_match = [0x400; 192];
+        self.is_rep = [0x400; 12];
+        self.is_rep_g0 = [0x400; 12];
+        self.is_rep_g1 = [0x400; 12];
+        self.is_rep_g2 = [0x400; 12];
+        self.is_rep_0long = [0x400; 192];
         self.state = 0;
-        self.rep.fill(0);
+        self.rep = [0; 4];
         self.len_decoder.reset();
         self.rep_len_decoder.reset();
     }
@@ -239,7 +247,7 @@ impl DecoderState {
     pub fn process<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
         &mut self,
         output: &mut LZB,
-        rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
+        rangecoder: &mut RangeDecoder<'a, R>,
     ) -> error::Result<()> {
         self.process_mode(output, rangecoder, ProcessingMode::Finish)
     }
@@ -248,7 +256,7 @@ impl DecoderState {
     pub fn process_stream<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
         &mut self,
         output: &mut LZB,
-        rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
+        rangecoder: &mut RangeDecoder<'a, R>,
     ) -> error::Result<()> {
         self.process_mode(output, rangecoder, ProcessingMode::Partial)
     }
@@ -262,7 +270,7 @@ impl DecoderState {
     fn process_next_inner<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
         &mut self,
         output: &mut LZB,
-        rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
+        rangecoder: &mut RangeDecoder<'a, R>,
         update: bool,
     ) -> error::Result<ProcessingStatus> {
         let pos_state = output.len() & ((1 << self.lzma_props.pb) - 1);
@@ -379,7 +387,7 @@ impl DecoderState {
     fn process_next<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
         &mut self,
         output: &mut LZB,
-        rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
+        rangecoder: &mut RangeDecoder<'a, R>,
     ) -> error::Result<ProcessingStatus> {
         self.process_next_inner(output, rangecoder, true)
     }
@@ -397,7 +405,7 @@ impl DecoderState {
         code: u32,
     ) -> error::Result<()> {
         let mut temp = std::io::Cursor::new(buf);
-        let mut rangecoder = rangecoder::RangeDecoder::from_parts(&mut temp, range, code);
+        let mut rangecoder = RangeDecoder::from_parts(&mut temp, range, code);
         let _ = self.process_next_inner(output, &mut rangecoder, false)?;
         Ok(())
     }
@@ -405,7 +413,7 @@ impl DecoderState {
     /// Utility function to read data into the partial input buffer.
     fn read_partial_input_buf<'a, R: io::BufRead>(
         &mut self,
-        rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
+        rangecoder: &mut RangeDecoder<'a, R>,
     ) -> error::Result<()> {
         // Fill as much of the tmp buffer as possible
         let start = self.partial_input_buf.position() as usize;
@@ -419,7 +427,7 @@ impl DecoderState {
     fn process_mode<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
         &mut self,
         output: &mut LZB,
-        rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
+        rangecoder: &mut RangeDecoder<'a, R>,
         mode: ProcessingMode,
     ) -> error::Result<()> {
         loop {
@@ -460,11 +468,8 @@ impl DecoderState {
                 // Run the decompressor on the tmp buffer
                 let mut tmp_reader =
                     io::Cursor::new(&tmp[..self.partial_input_buf.position() as usize]);
-                let mut tmp_rangecoder = rangecoder::RangeDecoder::from_parts(
-                    &mut tmp_reader,
-                    rangecoder.range,
-                    rangecoder.code,
-                );
+                let mut tmp_rangecoder =
+                    RangeDecoder::from_parts(&mut tmp_reader, rangecoder.range, rangecoder.code);
                 let res = self.process_next(output, &mut tmp_rangecoder)?;
 
                 // Update the actual rangecoder
@@ -513,7 +518,7 @@ impl DecoderState {
     fn decode_literal<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
         &mut self,
         output: &mut LZB,
-        rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
+        rangecoder: &mut RangeDecoder<'a, R>,
         update: bool,
     ) -> error::Result<u8> {
         let def_prev_byte = 0u8;
@@ -549,7 +554,7 @@ impl DecoderState {
 
     fn decode_distance<'a, R: io::BufRead>(
         &mut self,
-        rangecoder: &mut rangecoder::RangeDecoder<'a, R>,
+        rangecoder: &mut RangeDecoder<'a, R>,
         length: usize,
         update: bool,
     ) -> error::Result<usize> {
diff --git a/src/decode/rangecoder.rs b/src/decode/rangecoder.rs
index 4cee3ad..52271f9 100644
--- a/src/decode/rangecoder.rs
+++ b/src/decode/rangecoder.rs
@@ -190,8 +190,8 @@ impl BitTree {
 pub struct LenDecoder {
     choice: u16,
     choice2: u16,
-    low_coder: Vec<BitTree>,
-    mid_coder: Vec<BitTree>,
+    low_coder: [BitTree; 16],
+    mid_coder: [BitTree; 16],
     high_coder: BitTree,
 }
 
@@ -200,8 +200,42 @@ impl LenDecoder {
         LenDecoder {
             choice: 0x400,
             choice2: 0x400,
-            low_coder: vec![BitTree::new(3); 16],
-            mid_coder: vec![BitTree::new(3); 16],
+            low_coder: [
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+            ],
+            mid_coder: [
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+                BitTree::new(3),
+            ],
             high_coder: BitTree::new(8),
         }
     }
diff --git a/src/lib.rs b/src/lib.rs
index 7bd7730..f8bac25 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -12,6 +12,7 @@ mod encode;
 
 pub mod error;
 
+mod util;
 mod xz;
 
 use std::io;
diff --git a/src/util/mod.rs b/src/util/mod.rs
new file mode 100644
index 0000000..ee2474e
--- /dev/null
+++ b/src/util/mod.rs
@@ -0,0 +1 @@
+pub mod vec2d;
diff --git a/src/util/vec2d.rs b/src/util/vec2d.rs
new file mode 100644
index 0000000..0b476e1
--- /dev/null
+++ b/src/util/vec2d.rs
@@ -0,0 +1,161 @@
+use std::ops::{Index, IndexMut};
+
+/// A 2 dimensional matrix in row-major order backed by a contiguous slice.
+#[derive(Debug)]
+pub struct Vec2D<T> {
+    data: Box<[T]>,
+    cols: usize,
+}
+
+impl<T> Vec2D<T> {
+    /// Initialize a grid of size (`rows`, `cols`) with the given data element.
+    pub fn init(data: T, size: (usize, usize)) -> Vec2D<T>
+    where
+        T: Clone,
+    {
+        let (rows, cols) = size;
+        let len = rows.checked_mul(cols).expect(&format!(
+            "{} rows by {} cols exceeds usize::MAX",
+            rows, cols
+        ));
+        Vec2D {
+            data: vec![data; len].into_boxed_slice(),
+            cols,
+        }
+    }
+
+    /// Fills the grid with elements by cloning `value`.
+    pub fn fill(&mut self, value: T)
+    where
+        T: Clone,
+    {
+        self.data.fill(value)
+    }
+}
+
+impl<T> Index<usize> for Vec2D<T> {
+    type Output = [T];
+
+    #[inline]
+    fn index(&self, row: usize) -> &Self::Output {
+        let start_row = row * self.cols;
+        &self.data[start_row..start_row + self.cols]
+    }
+}
+
+impl<T> IndexMut<usize> for Vec2D<T> {
+    #[inline]
+    fn index_mut(&mut self, row: usize) -> &mut Self::Output {
+        let start_row = row * self.cols;
+        &mut self.data[start_row..start_row + self.cols]
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn init() {
+        let vec2d = Vec2D::init(1, (2, 3));
+        assert_eq!(vec2d[0], [1, 1, 1]);
+        assert_eq!(vec2d[1], [1, 1, 1]);
+    }
+
+    #[test]
+    #[should_panic]
+    fn init_overflow() {
+        Vec2D::init(1, (usize::MAX, usize::MAX));
+    }
+
+    #[test]
+    fn fill() {
+        let mut vec2d = Vec2D::init(0, (2, 3));
+        vec2d.fill(7);
+        assert_eq!(vec2d[0], [7, 7, 7]);
+        assert_eq!(vec2d[1], [7, 7, 7]);
+    }
+
+    #[test]
+    fn index() {
+        let vec2d = Vec2D {
+            data: vec![0, 1, 2, 3, 4, 5, 6, 7].into_boxed_slice(),
+            cols: 2,
+        };
+        assert_eq!(vec2d[0], [0, 1]);
+        assert_eq!(vec2d[1], [2, 3]);
+        assert_eq!(vec2d[2], [4, 5]);
+        assert_eq!(vec2d[3], [6, 7]);
+    }
+
+    #[test]
+    fn index_mut() {
+        let mut vec2d = Vec2D {
+            data: vec![0, 1, 2, 3, 4, 5, 6, 7].into_boxed_slice(),
+            cols: 2,
+        };
+
+        vec2d[1][1] = 9;
+        assert_eq!(vec2d[0], [0, 1]);
+        // 1,1 should be 9.
+        assert_eq!(vec2d[1], [2, 9]);
+        assert_eq!(vec2d[2], [4, 5]);
+        assert_eq!(vec2d[3], [6, 7]);
+    }
+
+    #[test]
+    #[should_panic]
+    fn index_out_of_bounds() {
+        let vec2d = Vec2D::init(1, (2, 3));
+        let _x = vec2d[2][3];
+    }
+
+    #[test]
+    #[should_panic]
+    fn index_out_of_bounds_vec_edge() {
+        let vec2d = Vec2D::init(1, (2, 3));
+        let _x = vec2d[1][3];
+    }
+
+    #[test]
+    #[should_panic]
+    fn column_out_of_bounds() {
+        let vec2d = Vec2D::init(1, (2, 3));
+        let _x = vec2d[0][3];
+    }
+
+    #[test]
+    #[should_panic]
+    fn row_out_of_bounds() {
+        let vec2d = Vec2D::init(1, (2, 3));
+        let _x = vec2d[2][0];
+    }
+
+    #[test]
+    #[should_panic]
+    fn indexmut_out_of_bounds_vec_edge() {
+        let mut vec2d = Vec2D::init(1, (2, 3));
+        vec2d[1][3] = 0;
+    }
+
+    #[test]
+    #[should_panic]
+    fn mut_column_out_of_bounds() {
+        let mut vec2d = Vec2D::init(1, (2, 3));
+        vec2d[0][3] = 0;
+    }
+
+    #[test]
+    #[should_panic]
+    fn indexmut_out_of_bounds() {
+        let mut vec2d = Vec2D::init(1, (2, 3));
+        vec2d[2][3] = 0;
+    }
+
+    #[test]
+    #[should_panic]
+    fn mut_row_out_of_bounds() {
+        let mut vec2d = Vec2D::init(1, (2, 3));
+        vec2d[2][0] = 0;
+    }
+}