deps: update zlib to 1.3.0.1-motley-24c07df #52199

Merged (1 commit, Mar 27, 2024)
46 changes: 32 additions & 14 deletions deps/zlib/CMakeLists.txt
@@ -74,6 +74,16 @@ if (ENABLE_SIMD_OPTIMIZATIONS)

SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto")
endif()

+  if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
+    add_definitions(-DRISCV_RVV)
+    add_definitions(-DDEFLATE_SLIDE_HASH_RVV)
+    add_definitions(-DADLER32_SIMD_RVV)
+    #TODO(cavalcantii): add remaining flags as we port optimizations to RVV.
+    # Required by CPU features detection code.
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --target=riscv64-unknown-linux-gnu -march=rv64gcv")
+  endif()

endif()

#
@@ -180,20 +190,28 @@ set(ZLIB_SRCS
# Update list of source files if optimizations were enabled
#============================================================================
if (ENABLE_SIMD_OPTIMIZATIONS)
-  list(REMOVE_ITEM ZLIB_SRCS inflate.c)
-
-  list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
-  list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/chunkcopy.h)
-  list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.h)
-  list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)
-  list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.h)
-
-  list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
-  list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.c)
-  list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inflate.c)
-  list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
-  list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.c)
-  list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc_folding.c)
+  if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
+    message("RISCVV: Add optimizations.")
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
+  else()
+    list(REMOVE_ITEM ZLIB_SRCS inflate.c)
+
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/chunkcopy.h)
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.h)
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.h)
+
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inflate.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc_folding.c)
+  endif()
endif()

# parse the full version number from zlib.h and include in ZLIB_FULL_VERSION
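A note on the new flags: -march=rv64gcv is what actually enables the vector extension in the compiler, while RISCV_RVV and ADLER32_SIMD_RVV select the guarded code paths in the sources below. A minimal compile-time sanity check, assuming a toolchain that defines __riscv_v_intrinsic when the RVV intrinsics are available (this snippet is illustrative only, not part of the PR):

/* Sketch: compile-time sanity check for the new build flags.
 * RISCV_RVV comes from the CMake change above; __riscv_v_intrinsic is
 * defined by toolchains whose -march value (here rv64gcv) enables the
 * RVV intrinsics. Illustrative only, not part of this PR.
 */
#if defined(RISCV_RVV) && !defined(__riscv_v_intrinsic)
#error "RISCV_RVV is set but the compiler does not provide RVV intrinsics"
#endif
#if defined(RISCV_RVV)
#include <riscv_vector.h>  /* made available by -march=rv64gcv */
#endif
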
13 changes: 9 additions & 4 deletions deps/zlib/adler32.c
@@ -58,20 +58,24 @@
#endif

#include "cpu_features.h"
-#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON)
+#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) || defined(ADLER32_SIMD_RVV)
#include "adler32_simd.h"
#endif

/* ========================================================================= */
uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf, z_size_t len) {
unsigned long sum2;
unsigned n;

     /* TODO(cavalcantii): verify if these lengths are optimal for current CPUs. */
+#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) \
+    || defined(ADLER32_SIMD_RVV)
 #if defined(ADLER32_SIMD_SSSE3)
     if (buf != Z_NULL && len >= 64 && x86_cpu_enable_ssse3)
-        return adler32_simd_(adler, buf, len);
 #elif defined(ADLER32_SIMD_NEON)
     if (buf != Z_NULL && len >= 64)
+#elif defined(ADLER32_SIMD_RVV)
+    if (buf != Z_NULL && len >= 32 && riscv_cpu_enable_rvv)
+#endif
         return adler32_simd_(adler, buf, len);
 #endif

@@ -90,7 +94,8 @@ uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf, z_size_t len) {
return adler | (sum2 << 16);
}

-#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON)
+#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) \
+    || defined(RISCV_RVV)
/*
* Use SIMD to compute the adler32. Since this function can be
* freely used, check CPU features here. zlib convention is to
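The adler32_z() change above follows zlib's usual SIMD dispatch shape: take the vector path only for a non-NULL buffer long enough to amortize vector setup (64 bytes for SSSE3/NEON, 32 for RVV), and only when the CPU feature was detected at runtime. A self-contained sketch of that shape; adler32_scalar_ and adler32_dispatch are hypothetical names for illustration (only adler32_simd_ and riscv_cpu_enable_rvv exist in the tree):

#include <stddef.h>
#include <stdint.h>

extern int riscv_cpu_enable_rvv;                  /* set by cpu_check_features() */
uint32_t adler32_simd_(uint32_t adler, const unsigned char *buf,
                       unsigned long len);        /* RVV path added by this PR */
uint32_t adler32_scalar_(uint32_t adler, const unsigned char *buf,
                         unsigned long len);      /* hypothetical fallback */

uint32_t adler32_dispatch(uint32_t adler, const unsigned char *buf,
                          unsigned long len)
{
    /* Vector path only when the buffer is real, big enough to amortize
     * the setup cost, and the kernel reported the V extension at runtime. */
    if (buf != NULL && len >= 32 && riscv_cpu_enable_rvv)
        return adler32_simd_(adler, buf, len);
    return adler32_scalar_(adler, buf, len);
}
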
104 changes: 104 additions & 0 deletions deps/zlib/adler32_simd.c
@@ -41,6 +41,9 @@
* [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
* of the adler s1 s2 of uint32_t type (see adler32.c).
*/
/* Copyright (C) 2023 SiFive, Inc. All rights reserved.
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#include "adler32_simd.h"

@@ -363,4 +366,105 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
return s1 | (s2 << 16);
}

#elif defined(ADLER32_SIMD_RVV)
#include <riscv_vector.h>
/* adler32_rvv.c - RVV version of Adler-32
* RVV 1.0 code contributed by Alex Chiang <alex.chiang@sifive.com>
* on https://github.com/zlib-ng/zlib-ng/pull/1532
* Port from Simon Hosie's fork:
* https://github.com/cloudflare/zlib/commit/40688b53c61cb9bfc36471acd2dc0800b7ebcab1
*/

uint32_t ZLIB_INTERNAL adler32_simd_( /* RVV */
    uint32_t adler,
    const unsigned char *buf,
    unsigned long len)
{
    /* split Adler-32 into component sums */
    uint32_t sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    size_t left = len;
    size_t vl = __riscv_vsetvlmax_e8m1();
    vl = vl > 256 ? 256 : vl;
    vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
    vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
    vuint16m2_t v_buf16_accu;

    /*
     * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
     * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
     * accumulators to boost performance.
     *
     * The block_size is the largest multiple of vl that is <= 256, because a 16-bit accumulator
     * would overflow beyond that (255 * 256 <= UINT16_MAX).
     *
     * We accumulate 8-bit data into a 16-bit accumulator and then
     * move the data into the 32-bit accumulator at the last iteration.
     */
    size_t block_size = (256 / vl) * vl;
    size_t nmax_limit = (NMAX / block_size);
    size_t cnt = 0;
    while (left >= block_size) {
        v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
        size_t subprob = block_size;
        while (subprob > 0) {
            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
            v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
            v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
            buf += vl;
            subprob -= vl;
        }
        v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
        v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
        left -= block_size;
        /* do modulo once each block of NMAX size */
        if (++cnt >= nmax_limit) {
            v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
            cnt = 0;
        }
    }
    /* the remaining len is <= 256 now, so we can use the 16-bit accum safely */
    v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
    size_t res = left;
    while (left >= vl) {
        vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
        v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
        v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
        buf += vl;
        left -= vl;
    }
    v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
    v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
    v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);

    vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
    vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
    vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);

    v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);

    vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
    v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
    uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);

    sum2 += (sum2_sum + adler * (len - left));

    vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
    v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
    uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);

    adler += adler_sum;

    while (left--) {
        adler += *buf++;
        sum2 += adler;
    }

    sum2 %= BASE;
    adler %= BASE;

    return adler | (sum2 << 16);
}

#endif /* ADLER32_SIMD_SSSE3 */
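For reference, a sketch (not in the PR) of the algebra the vector loop relies on. Writing a for the low 16-bit sum and s for the high one, the scalar recurrence a_i = a_{i-1} + b_i, s_i = s_{i-1} + a_i over bytes b_1, ..., b_n has the closed form

    a_n = a_0 + \sum_{i=1}^{n} b_i \pmod{65521}

    s_n = s_0 + n\,a_0 + \sum_{i=1}^{n} (n - i + 1)\,b_i \pmod{65521}

The (n - i + 1) weights are what v_rev_seq applies lane by lane, and the n * a_0 term is the adler * (len - left) added to sum2 near the end. The 16-bit accumulators are safe because a lane absorbs at most 256 bytes of value at most 255 per block, and 255 * 256 = 65280 fits in a uint16_t; reducing modulo BASE once every NMAX bytes keeps the 32-bit accumulators from overflowing, matching the scalar code in adler32.c.
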
32 changes: 28 additions & 4 deletions deps/zlib/cpu_features.c
@@ -33,9 +33,13 @@ int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0;
int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0;

+int ZLIB_INTERNAL riscv_cpu_enable_rvv = 0;
+int ZLIB_INTERNAL riscv_cpu_enable_vclmul = 0;

#ifndef CPU_NO_SIMD

-#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || defined(ARMV8_OS_FUCHSIA) || defined(ARMV8_OS_IOS)
+#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || \
+    defined(ARMV8_OS_FUCHSIA) || defined(ARMV8_OS_IOS)
#include <pthread.h>
#endif

@@ -62,7 +66,10 @@ int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0;
static void _cpu_check_features(void);
#endif

-#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || defined(ARMV8_OS_MACOS) || defined(ARMV8_OS_FUCHSIA) || defined(X86_NOT_WINDOWS) || defined(ARMV8_OS_IOS)
+#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || \
+    defined(ARMV8_OS_MACOS) || defined(ARMV8_OS_FUCHSIA) || \
+    defined(X86_NOT_WINDOWS) || defined(ARMV8_OS_IOS) || \
+    defined(RISCV_RVV)
#if !defined(ARMV8_OS_MACOS)
// _cpu_check_features() doesn't need to do anything on mac/arm since all
// features are known at build time, so don't call it.
@@ -184,6 +191,23 @@ static void _cpu_check_features(void)
x86_cpu_enable_avx512 = _xgetbv(0) & 0x00000040;
#endif
}
+#endif // x86 & NO_SIMD
+
+#elif defined(RISCV_RVV)
+#include <sys/auxv.h>
+
+#ifndef ZLIB_HWCAP_RVV
+#define ZLIB_HWCAP_RVV (1 << ('v' - 'a'))
+#endif
-#endif
-#endif
+
+/* TODO(cavalcantii)
+ * - add support for Android@RISCV i.e. __riscv_hwprobe().
+ * - detect vclmul (crypto extensions).
+ */
+static void _cpu_check_features(void)
+{
+    unsigned long features = getauxval(AT_HWCAP);
+    riscv_cpu_enable_rvv = !!(features & ZLIB_HWCAP_RVV);
+}
+#endif // ARM | x86 | RISCV
+#endif // NO SIMD CPU
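The detection itself is a single AT_HWCAP bit test: on Linux the kernel advertises the V extension in bit ('v' - 'a') = 21 of the hardware-capability word. A self-contained probe in the same spirit, where HWCAP_RVV_BIT mirrors the ZLIB_HWCAP_RVV macro above (illustrative, not part of the PR):

/* Minimal sketch: report whether the kernel advertises RVV.
 * Mirrors the ZLIB_HWCAP_RVV test above; Linux-only, illustrative. */
#include <stdio.h>
#include <sys/auxv.h>

#define HWCAP_RVV_BIT (1UL << ('v' - 'a'))  /* bit 21 of AT_HWCAP */

int main(void)
{
    unsigned long hwcap = getauxval(AT_HWCAP);
    printf("RVV %s\n", (hwcap & HWCAP_RVV_BIT) ? "available" : "not available");
    return 0;
}
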
3 changes: 3 additions & 0 deletions deps/zlib/cpu_features.h
@@ -16,4 +16,7 @@ extern int x86_cpu_enable_ssse3;
extern int x86_cpu_enable_simd;
extern int x86_cpu_enable_avx512;

extern int riscv_cpu_enable_rvv;
extern int riscv_cpu_enable_vclmul;

void cpu_check_features(void);
6 changes: 4 additions & 2 deletions deps/zlib/crc32.c
@@ -706,7 +706,8 @@ unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
* place to cache CPU features if needed for those later, more
* interesting crc32() calls.
*/
-#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
+#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) \
+    || defined(RISCV_RVV)
/*
* Since this routine can be freely used, check CPU features here.
*/
@@ -1085,7 +1086,8 @@ unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
/* Some bots compile with optimizations disabled, others will emulate
* ARM on x86 and other weird combinations.
*/
-#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
+#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) \
+    || defined(RISCV_RVV)
/* We got to verify CPU features, so exploit the common usage pattern
* of calling this function with Z_NULL for an initial valid crc value.
* This allows to cache the result of the feature check and avoid extraneous
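The comment above refers to a long-standing zlib idiom: crc32() called with a Z_NULL buffer returns the initial CRC unchanged, so a caller (or deflateInit2_ below) can use it to run cpu_check_features() once before any data is hashed. A usage sketch:

#include <zlib.h>

/* Sketch of the Z_NULL warm-up idiom described above: the first call
 * returns the initial CRC and, in this port, triggers CPU feature
 * detection as a side effect, so later calls take the fast path. */
unsigned long checksum(const unsigned char *data, unsigned int n)
{
    unsigned long crc = crc32(0L, Z_NULL, 0);  /* init + feature check */
    return crc32(crc, data, n);
}
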
3 changes: 2 additions & 1 deletion deps/zlib/deflate.c
@@ -401,7 +401,8 @@ int ZEXPORT deflateInit2_(z_streamp strm, int level, int method,
// for all wrapper formats (e.g. RAW, ZLIB, GZIP).
// Feature detection is not triggered while using RAW mode (i.e. we never
// call crc32() with a NULL buffer).
-#if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL)
+#if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL) \
+    || defined(RISCV_RVV)
cpu_check_features();
#endif

2 changes: 1 addition & 1 deletion src/zlib_version.h
@@ -2,5 +2,5 @@
// Refer to tools/dep_updaters/update-zlib.sh
#ifndef SRC_ZLIB_VERSION_H_
#define SRC_ZLIB_VERSION_H_
-#define ZLIB_VERSION "1.3.0.1-motley-24342f6"
+#define ZLIB_VERSION "1.3.0.1-motley-24c07df"
#endif // SRC_ZLIB_VERSION_H_