Skip to content

Commit

Permalink
Sync bela
Browse files Browse the repository at this point in the history
  • Loading branch information
fcharlie committed Jul 29, 2023
1 parent 57c258b commit 60bd8d2
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 12 deletions.
2 changes: 1 addition & 1 deletion src/belahash/blake3.lock
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
https://github.com/BLAKE3-team/BLAKE3
3f396d223946f722ab060fe9377cd1cebacaf4c0
12823b87604cbb7bcd0e5cdc347a53f80b2a617c
2 changes: 1 addition & 1 deletion src/belahash/blake3/blake3.c
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
// As a special case when the SIMD degree is 1, this function will still return
// at least 2 outputs. This guarantees that this function doesn't perform the
// root compression. (If it did, it would use the wrong flags, and also we
// wouldn't be able to implement exendable output.) Note that this function is
// wouldn't be able to implement extendable output.) Note that this function is
// not used when the whole input is only 1 chunk long; that's a different
// codepath.
//
Expand Down
2 changes: 1 addition & 1 deletion src/belahash/blake3/blake3.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
extern "C" {
#endif

#define BLAKE3_VERSION_STRING "1.4.0"
#define BLAKE3_VERSION_STRING "1.4.1"
#define BLAKE3_KEY_LEN 32
#define BLAKE3_OUT_LEN 32
#define BLAKE3_BLOCK_LEN 64
Expand Down
39 changes: 34 additions & 5 deletions src/belahash/blake3/blake3_dispatch.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#if defined(IS_X86)
#if defined(_MSC_VER)
#include <Windows.h>
#include <intrin.h>
#elif defined(__GNUC__)
#include <immintrin.h>
Expand All @@ -14,6 +15,32 @@
#endif
#endif

#if !defined(BLAKE3_ATOMICS)
#if defined(__has_include)
#if __has_include(<stdatomic.h>) && !defined(_MSC_VER)
#define BLAKE3_ATOMICS 1
#else
#define BLAKE3_ATOMICS 0
#endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
#else
#define BLAKE3_ATOMICS 0
#endif /* defined(__has_include) */
#endif /* BLAKE3_ATOMICS */

#if BLAKE3_ATOMICS
#define ATOMIC_INT _Atomic int
#define ATOMIC_LOAD(x) x
#define ATOMIC_STORE(x, y) x = y
#elif defined(_MSC_VER)
#define ATOMIC_INT LONG
#define ATOMIC_LOAD(x) InterlockedOr(&x, 0)
#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y)
#else
#define ATOMIC_INT int
#define ATOMIC_LOAD(x) x
#define ATOMIC_STORE(x, y) x = y
#endif

#define MAYBE_UNUSED(x) (void)((x))

#if defined(IS_X86)
Expand Down Expand Up @@ -76,22 +103,24 @@ enum cpu_feature {
#if !defined(BLAKE3_TESTING)
static /* Allow the variable to be controlled manually for testing */
#endif
enum cpu_feature g_cpu_features = UNDEFINED;
ATOMIC_INT g_cpu_features = UNDEFINED;

#if !defined(BLAKE3_TESTING)
static
#endif
enum cpu_feature
get_cpu_features(void) {

if (g_cpu_features != UNDEFINED) {
return g_cpu_features;
/* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
enum cpu_feature features = ATOMIC_LOAD(g_cpu_features);
if (features != UNDEFINED) {
return features;
} else {
#if defined(IS_X86)
uint32_t regs[4] = {0};
uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
(void)edx;
enum cpu_feature features = 0;
features = 0;
cpuid(regs, 0);
const int max_id = *eax;
cpuid(regs, 1);
Expand Down Expand Up @@ -124,7 +153,7 @@ static
}
}
}
g_cpu_features = features;
ATOMIC_STORE(g_cpu_features, features);
return features;
#else
/* How to detect NEON? */
Expand Down
25 changes: 21 additions & 4 deletions src/belahash/blake3/blake3_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,36 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
}

INLINE uint32x4_t rot16_128(uint32x4_t x) {
return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
// The straightfoward implementation would be two shifts and an or, but that's
// slower on microarchitectures we've tested. See
// https://github.com/BLAKE3-team/BLAKE3/pull/319.
// return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
}

INLINE uint32x4_t rot12_128(uint32x4_t x) {
return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
// See comment in rot16_128.
// return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12);
}

INLINE uint32x4_t rot8_128(uint32x4_t x) {
return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
// See comment in rot16_128.
// return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
#if defined(__clang__)
return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12));
#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700
static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12};
return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
#else
return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8);
#endif
}

INLINE uint32x4_t rot7_128(uint32x4_t x) {
return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
// See comment in rot16_128.
// return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7);
}

// TODO: compress_neon
Expand Down

0 comments on commit 60bd8d2

Please sign in to comment.