Skip to content

Commit

Permalink
tumbling down (test AVX2)
Browse files Browse the repository at this point in the history
  • Loading branch information
seiko2plus committed Aug 11, 2020
1 parent 95c485a commit 8d4ae79
Show file tree
Hide file tree
Showing 4 changed files with 231 additions and 247 deletions.
5 changes: 2 additions & 3 deletions numpy/core/src/common/simd/avx2/conversion.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,10 @@ NPY_FINLINE npyv_b8 npyv_pack_b16(npyv_b16 a, npyv_b16 b)
// pack four 32-bit boolean vectors into one 8-bit boolean vector
// NOTE(review): this text is a diff view (the file header reports 2 additions and
// 3 deletions), so the body below lists two implementations interleaved: it declares
// `abcd` twice and has two return statements. Presumably the `perm` /
// _mm256_packs_epi16 / _mm256_permutevar8x32_epi32 lines are the removed version and
// the npyv_pack_b16 / _mm256_shuffle_epi32 lines are the replacement -- confirm
// against the repository before relying on either one.
NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d)
{
// 32-bit element order used by the cross-lane permute below: interleaves the
// dwords of the low 128-bit half (0..3) with those of the high half (4..7)
const __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
// narrow 32-bit lanes to 16-bit; the AVX2 pack instructions work on each 128-bit
// half independently, which is why a reordering step is needed afterwards
__m256i ab = _mm256_packs_epi32(a, b);
__m256i cd = _mm256_packs_epi32(c, d);
// variant 1: narrow 16-bit -> 8-bit, then restore element order with one
// 8 x 32-bit cross-lane permute
__m256i abcd = _mm256_packs_epi16(ab, cd);
return _mm256_permutevar8x32_epi32(abcd, perm);
// variant 2: delegate the 16-bit -> 8-bit step to npyv_pack_b16 (defined outside
// this view; presumably it already fixes the 64-bit element order -- verify), then
// swap the two middle dwords inside each 128-bit half
__m256i abcd = npyv_pack_b16(ab, cd);
return _mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0));
}
// pack eight 64-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b16 npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
Expand Down
14 changes: 14 additions & 0 deletions numpy/core/src/common/simd/avx2/memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,19 @@ NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, int stride)
// strided float load: reuse the 32-bit unsigned strided loader on the same
// addresses and reinterpret the loaded bits as single-precision lanes
NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, int stride)
{
    const npy_uint32 *src = (const npy_uint32*)ptr;
    return _mm256_castsi256_ps(npyv_loadn_u32(src, stride));
}
//// 64
// Non-contiguous load of four doubles: ptr[0], ptr[stride], ptr[2*stride] and
// ptr[3*stride], returned in that order. `stride` is counted in elements.
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, int stride)
{
    // Walk the pointer one stride at a time instead of evaluating stride*2 and
    // stride*3 in `int`: those products overflow (undefined behavior) for large
    // strides even when the resulting addresses themselves are valid.
    const double *p1 = ptr + stride;
    const double *p2 = p1 + stride;
    const double *p3 = p2 + stride;
    // 64-bit loads place elements 0 and 2 in the low half of a 128-bit register
    __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr));
    __m128d a2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)p2));
    // elements 1 and 3 are then loaded into the corresponding high halves
    __m128d a01 = _mm_loadh_pd(a0, p1);
    __m128d a23 = _mm_loadh_pd(a2, p3);
    // low 128 bits = {e0, e1}, high 128 bits = {e2, e3}
    return _mm256_insertf128_pd(_mm256_castpd128_pd256(a01), a23, 1);
}
// strided load of unsigned 64-bit lanes: performed through the f64 strided
// loader, then the 256-bit result is reinterpreted as an integer vector
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, int stride)
{
    const double *src = (const double*)ptr;
    return _mm256_castpd_si256(npyv_loadn_f64(src, stride));
}
// strided load of signed 64-bit lanes: performed through the f64 strided
// loader, then the 256-bit result is reinterpreted as an integer vector
NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, int stride)
{
    const double *src = (const double*)ptr;
    return _mm256_castpd_si256(npyv_loadn_f64(src, stride));
}
/*
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, int stride)
{
const __m128i steps = _mm_setr_epi32(0, 1, 2, 3);
Expand All @@ -127,6 +140,7 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, int stride)
{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, int stride)
{ return _mm256_castsi256_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
*/

/***************************
* Non-contiguous Store
Expand Down
11 changes: 9 additions & 2 deletions numpy/core/src/common/simd/avx2/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
// bitwise XOR for double-precision vectors: a direct alias of the
// _mm256_xor_pd intrinsic (same argument order, no wrapper)
#define npyv_xor_f64 _mm256_xor_pd

// NOT
// NOTE(review): npyv_not_u8 is listed twice because this text is a diff view;
// presumably the xor form is the removed line and the andnot form its
// replacement -- confirm against the repository.
// Both forms produce the bitwise complement of A against an all-ones vector:
// xor computes A ^ ~0, andnot computes (~A) & ~0.
#define npyv_not_u8(A) _mm256_xor_si256(A, _mm256_set1_epi32(-1))
#define npyv_not_u8(A) _mm256_andnot_si256(A, _mm256_set1_epi32(-1))
#define npyv_not_s8 npyv_not_u8
#define npyv_not_b8 npyv_not_u8
#define npyv_not_u16 npyv_not_u8
Expand Down Expand Up @@ -151,6 +151,7 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
// signed 64-bit A >= B, expressed as NOT(B > A): the operands are swapped for
// the greater-than compare and the resulting mask is inverted
#define npyv_cmpge_s64(A, B) npyv_not_s64(_mm256_cmpgt_epi64(B, A))

// unsigned greater than
/*
#define NPYV_IMPL_AVX2_UNSIGNED_GT(LEN, SIGN) \
NPY_FINLINE __m256i npyv_cmpgt_u##LEN(__m256i a, __m256i b) \
{ \
Expand All @@ -163,7 +164,13 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
NPYV_IMPL_AVX2_UNSIGNED_GT(8, 0x80808080)
NPYV_IMPL_AVX2_UNSIGNED_GT(16, 0x80008000)
NPYV_IMPL_AVX2_UNSIGNED_GT(32, 0x80000000)

*/
// unsigned 8-bit a > b: b equals max(b, a) exactly in the lanes where b >= a,
// so inverting that equality mask leaves the lanes where a > b
NPY_FINLINE __m256i npyv_cmpgt_u8(__m256i a, __m256i b)
{
    const __m256i larger = _mm256_max_epu8(b, a);
    const __m256i b_ge_a = _mm256_cmpeq_epi8(b, larger);
    return npyv_not_u8(b_ge_a);
}
// unsigned 16-bit a > b: b equals max(b, a) exactly in the lanes where b >= a,
// so inverting that equality mask leaves the lanes where a > b
NPY_FINLINE __m256i npyv_cmpgt_u16(__m256i a, __m256i b)
{
    const __m256i larger = _mm256_max_epu16(b, a);
    const __m256i b_ge_a = _mm256_cmpeq_epi16(b, larger);
    return npyv_not_u16(b_ge_a);
}
// unsigned 32-bit a > b: b equals max(b, a) exactly in the lanes where b >= a,
// so inverting that equality mask leaves the lanes where a > b
NPY_FINLINE __m256i npyv_cmpgt_u32(__m256i a, __m256i b)
{
    const __m256i larger = _mm256_max_epu32(b, a);
    const __m256i b_ge_a = _mm256_cmpeq_epi32(b, larger);
    return npyv_not_u32(b_ge_a);
}
NPY_FINLINE __m256i npyv_cmpgt_u64(__m256i a, __m256i b)
{
const __m256i sbit = _mm256_set1_epi64x(0x8000000000000000);
Expand Down

0 comments on commit 8d4ae79

Please sign in to comment.