  * [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
  * of the adler s1 s2 of uint32_t type (see adler32.c).
  */
-/* Copyright (C) 2023 SiFive, Inc. All rights reserved.
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
 
 #include "adler32_simd.h"
 
@@ -368,103 +365,92 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
 
 #elif defined(ADLER32_SIMD_RVV)
 #include <riscv_vector.h>
-/* adler32_rvv.c - RVV version of Adler-32
- * RVV 1.0 code contributed by Alex Chiang <alex.chiang@sifive.com>
- * on https://github.com/zlib-ng/zlib-ng/pull/1532
- * Port from Simon Hosie's fork:
- * https://github.com/cloudflare/zlib/commit/40688b53c61cb9bfc36471acd2dc0800b7ebcab1
+
+/*
+ * Patch by Simon Hosie, from:
+ * https://github.com/cloudflare/zlib/pull/55
  */
 
 uint32_t ZLIB_INTERNAL adler32_simd_( /* RVV */
     uint32_t adler,
     const unsigned char *buf,
     unsigned long len)
 {
-    /* split Adler-32 into component sums */
-    uint32_t sum2 = (adler >> 16) & 0xffff;
-    adler &= 0xffff;
-
-    size_t left = len;
-    size_t vl = __riscv_vsetvlmax_e8m1();
-    vl = vl > 256 ? 256 : vl;
-    vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
-    vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
-    vuint16m2_t v_buf16_accu;
-
-    /*
-     * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
-     * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
-     * accumulators to boost performance.
-     *
-     * The block_size is the largest multiple of vl that <= 256, because overflow would occur when
-     * vl > 256 (255 * 256 <= UINT16_MAX).
-     *
-     * We accumulate 8-bit data into a 16-bit accumulator and then
-     * move the data into the 32-bit accumulator at the last iteration.
+    size_t vl = __riscv_vsetvlmax_e8m2();
+    const vuint16m4_t zero16 = __riscv_vmv_v_x_u16m4(0, vl);
+    vuint16m4_t a_sum = zero16;
+    vuint32m8_t b_sum = __riscv_vmv_v_x_u32m8(0, vl);
+
+    /* Deal with the part which is not a multiple of vl first; because it's
+     * easier to zero-stuff the beginning of the checksum than it is to tweak the
+     * multipliers and sums for odd lengths afterwards.
+     */
+    size_t head = len & (vl - 1);
+    if (head > 0) {
+        vuint8m2_t zero8 = __riscv_vmv_v_x_u8m2(0, vl);
+        vuint8m2_t in = __riscv_vle8_v_u8m2(buf, vl);
+        in = __riscv_vslideup(zero8, in, vl - head, vl);
+        vuint16m4_t in16 = __riscv_vwcvtu_x(in, vl);
+        a_sum = in16;
+        buf += head;
+    }
+
+    /* We have a 32-bit accumulator, and in each iteration we add 22-times a
+     * 16-bit value, plus another 16-bit value. We periodically subtract up to
+     * 65535 times BASE to avoid overflow. b_overflow estimates how often we
+     * need to do this subtraction.
+     */
+    const int b_overflow = BASE / 23;
+    int fixup = b_overflow;
+    ssize_t iters = (len - head) / vl;
+    while (iters > 0) {
+        const vuint16m4_t a_overflow = __riscv_vrsub(a_sum, BASE, vl);
+        int batch = iters < 22 ? iters : 22;
+        iters -= batch;
+        b_sum = __riscv_vwmaccu(b_sum, batch, a_sum, vl);
+        vuint16m4_t a_batch = zero16, b_batch = zero16;
+
+        /* Do a short batch, where neither a_sum nor b_sum can overflow a 16-bit
+         * register. Then add them back into the main accumulators.
          */
-    size_t block_size = (256 / vl) * vl;
-    size_t nmax_limit = (NMAX / block_size);
-    size_t cnt = 0;
-    while (left >= block_size) {
-        v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
-        size_t subprob = block_size;
-        while (subprob > 0) {
-            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
-            v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
-            v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
-            buf += vl;
-            subprob -= vl;
-        }
-        v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
-        v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
-        left -= block_size;
-        /* do modulo once each block of NMAX size */
-        if (++cnt >= nmax_limit) {
-            v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
-            cnt = 0;
-        }
+        while (batch-- > 0) {
+            vuint8m2_t in8 = __riscv_vle8_v_u8m2(buf, vl);
+            buf += vl;
+            b_batch = __riscv_vadd(b_batch, a_batch, vl);
+            a_batch = __riscv_vwaddu_wv(a_batch, in8, vl);
         }
-    /* the left len <= 256 now, we can use 16-bit accum safely */
-    v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
-    size_t res = left;
-    while (left >= vl) {
-        vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
-        v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
-        v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
-        buf += vl;
-        left -= vl;
+        vbool4_t ov = __riscv_vmsgeu(a_batch, a_overflow, vl);
+        a_sum = __riscv_vadd(a_sum, a_batch, vl);
+        a_sum = __riscv_vadd_mu(ov, a_sum, a_sum, 65536 - BASE, vl);
+        b_sum = __riscv_vwaddu_wv(b_sum, b_batch, vl);
+        if (--fixup <= 0) {
+            b_sum = __riscv_vnmsac(b_sum, BASE, __riscv_vsrl(b_sum, 16, vl), vl);
+            fixup = b_overflow;
         }
-    v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
-    v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
-    v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
-
-    vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
-    vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
-    vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);
-
-    v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
-
-    vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
-    v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
-    uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);
-
-    sum2 += (sum2_sum + adler * (len - left));
-
-    vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
-    v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
-    uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);
-
-    adler += adler_sum;
-
-    while (left--) {
-        adler += *buf++;
-        sum2 += adler;
-    }
-
-    sum2 %= BASE;
-    adler %= BASE;
-
-    return adler | (sum2 << 16);
+    }
+    /* Adjust per-lane sums to have appropriate offsets from the end of the
+     * buffer.
+     */
+    const vuint16m4_t off = __riscv_vrsub(__riscv_vid_v_u16m4(vl), vl, vl);
+    vuint16m4_t bsum16 = __riscv_vncvt_x(__riscv_vremu(b_sum, BASE, vl), vl);
+    b_sum = __riscv_vadd(__riscv_vwmulu(a_sum, off, vl),
+                         __riscv_vwmulu(bsum16, vl, vl), vl);
+    bsum16 = __riscv_vncvt_x(__riscv_vremu(b_sum, BASE, vl), vl);
+
+    /* And finally, do a horizontal sum across the registers for the final
+     * result.
+     */
+    uint32_t a = adler & 0xffff;
+    uint32_t b = ((adler >> 16) + a * (len % BASE)) % BASE;
+    vuint32m1_t sca = __riscv_vmv_v_x_u32m1(a, 1);
+    vuint32m1_t scb = __riscv_vmv_v_x_u32m1(b, 1);
+    sca = __riscv_vwredsumu(a_sum, sca, vl);
+    scb = __riscv_vwredsumu(bsum16, scb, vl);
+    a = __riscv_vmv_x(sca);
+    b = __riscv_vmv_x(scb);
+    a %= BASE;
+    b %= BASE;
+    return (b << 16) | a;
 }
 
 #endif /* ADLER32_SIMD_SSSE3 */
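
Both the removed and the added RVV paths compute the same checksum that zlib's scalar adler32() defines: a byte sum a and a sum-of-sums b, each reduced modulo BASE, with the reduction deferred across bounded blocks so the accumulators cannot overflow. A minimal scalar sketch of that arithmetic, not part of this patch (the helper name is invented here; BASE = 65521 and NMAX = 5552 are the constants from zlib's adler32.c):

#include <stddef.h>
#include <stdint.h>

#define BASE 65521u  /* largest prime below 65536 */
#define NMAX 5552    /* largest n with 255*n*(n+1)/2 + (n+1)*(BASE-1) <= 2^32-1 */

/* Hypothetical reference helper, for illustration only. */
static uint32_t adler32_scalar_sketch(uint32_t adler,
                                      const unsigned char *buf, size_t len)
{
    uint32_t a = adler & 0xffff;          /* running byte sum       */
    uint32_t b = (adler >> 16) & 0xffff;  /* running sum of the a's */

    while (len > 0) {
        size_t n = len < NMAX ? len : NMAX;
        len -= n;
        while (n--) {
            a += *buf++;
            b += a;
        }
        a %= BASE;  /* one reduction per NMAX-byte block is enough */
        b %= BASE;
    }
    return (b << 16) | a;
}

The same headroom argument is what the new code's fixup step leans on: even 65535 pending multiples of BASE (65535 * 65521 = 4,293,918,735) still fit in a 32-bit b_sum lane.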
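
The "Adjust per-lane sums to have appropriate offsets" step in the new code rests on an identity between serial and column-wise sums: if lane j accumulates A[j] (the bytes it saw) and B[j] (the running A[j], added once per row before the new byte), then the serial sum-of-sums over a block of ROWS*VL bytes equals the sum over j of (VL - j)*A[j] + VL*B[j], which is what the off vector and the two widening multiplies compute before the horizontal reduction (the carried-in adler value and the modulo-BASE reductions are folded in separately). A small standalone check of that identity, with toy sizes and data invented for illustration:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define VL   8   /* stand-in for the vector length   */
#define ROWS 5   /* number of VL-wide rows processed */
#define N    (VL * ROWS)

int main(void)
{
    unsigned char x[N];
    for (int i = 0; i < N; i++)
        x[i] = (unsigned char)(i * 37 + 11);   /* arbitrary test data */

    /* Serial sums: S = sum of bytes, T = sum over i of (N - i) * x[i],
     * i.e. the contribution these bytes make to Adler-32's b term. */
    uint64_t S = 0, T = 0;
    for (int i = 0; i < N; i++) {
        S += x[i];
        T += (uint64_t)(N - i) * x[i];
    }

    /* Column-wise per-lane sums, accumulated the way the vector loop does:
     * add the old A[j] into B[j] before folding in the new row. */
    uint64_t A[VL] = {0}, B[VL] = {0};
    for (int k = 0; k < ROWS; k++) {
        for (int j = 0; j < VL; j++) {
            B[j] += A[j];
            A[j] += x[k * VL + j];
        }
    }

    /* Recombine with the same per-lane offsets the patch uses:
     * off[j] = VL - j, so T = sum_j (off[j] * A[j] + VL * B[j]). */
    uint64_t S2 = 0, T2 = 0;
    for (int j = 0; j < VL; j++) {
        S2 += A[j];
        T2 += (uint64_t)(VL - j) * A[j] + (uint64_t)VL * B[j];
    }

    assert(S == S2 && T == T2);
    printf("serial S=%llu T=%llu match the lane recombination\n",
           (unsigned long long)S, (unsigned long long)T);
    return 0;
}

Building this with any C compiler and running it should trip no assertion; that equality is the property the off/vwmulu/vwredsumu sequence in the patch depends on.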