-
-
Notifications
You must be signed in to change notification settings - Fork 9.5k
/
conversion.h
146 lines (133 loc) · 5.06 KB
/
conversion.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#ifndef NPY_SIMD
#error "Not a standalone header"
#endif
#ifndef _NPY_SIMD_VSX_CVT_H
#define _NPY_SIMD_VSX_CVT_H
// convert boolean vectors to integer vectors
#define npyv_cvt_u8_b8(BL) ((npyv_u8) BL)
#define npyv_cvt_s8_b8(BL) ((npyv_s8) BL)
#define npyv_cvt_u16_b16(BL) ((npyv_u16) BL)
#define npyv_cvt_s16_b16(BL) ((npyv_s16) BL)
#define npyv_cvt_u32_b32(BL) ((npyv_u32) BL)
#define npyv_cvt_s32_b32(BL) ((npyv_s32) BL)
#define npyv_cvt_u64_b64(BL) ((npyv_u64) BL)
#define npyv_cvt_s64_b64(BL) ((npyv_s64) BL)
#define npyv_cvt_f32_b32(BL) ((npyv_f32) BL)
#define npyv_cvt_f64_b64(BL) ((npyv_f64) BL)
// convert integer vectors to boolean vectors
#define npyv_cvt_b8_u8(A) ((npyv_b8) A)
#define npyv_cvt_b8_s8(A) ((npyv_b8) A)
#define npyv_cvt_b16_u16(A) ((npyv_b16) A)
#define npyv_cvt_b16_s16(A) ((npyv_b16) A)
#define npyv_cvt_b32_u32(A) ((npyv_b32) A)
#define npyv_cvt_b32_s32(A) ((npyv_b32) A)
#define npyv_cvt_b64_u64(A) ((npyv_b64) A)
#define npyv_cvt_b64_s64(A) ((npyv_b64) A)
#define npyv_cvt_b32_f32(A) ((npyv_b32) A)
#define npyv_cvt_b64_f64(A) ((npyv_b64) A)
//expand
NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data)
{
npyv_u16x2 r;
npyv_u8 zero = npyv_zero_u8();
r.val[0] = (npyv_u16)vec_mergeh(data, zero);
r.val[1] = (npyv_u16)vec_mergel(data, zero);
return r;
}
NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
{
npyv_u32x2 r;
npyv_u16 zero = npyv_zero_u16();
r.val[0] = (npyv_u32)vec_mergeh(data, zero);
r.val[1] = (npyv_u32)vec_mergel(data, zero);
return r;
}
// pack two 16-bit boolean into one 8-bit boolean vector
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
return vec_pack(a, b);
}
// pack four 32-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
npyv_b16 ab = vec_pack(a, b);
npyv_b16 cd = vec_pack(c, d);
return npyv_pack_b8_b16(ab, cd);
}
// pack eight 64-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
npyv_b32 ab = vec_pack(a, b);
npyv_b32 cd = vec_pack(c, d);
npyv_b32 ef = vec_pack(e, f);
npyv_b32 gh = vec_pack(g, h);
return npyv_pack_b8_b32(ab, cd, ef, gh);
}
// convert boolean vector to integer bitfield
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
{
const npyv_u8 qperm = npyv_set_u8(120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0);
return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
}
NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
{
const npyv_u8 qperm = npyv_setf_u8(128, 112, 96, 80, 64, 48, 32, 16, 0);
return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
}
NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
{
const npyv_u8 qperm = npyv_setf_u8(128, 96, 64, 32, 0);
return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
}
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
{
npyv_u64 bit = npyv_shri_u64((npyv_u64)a, 63);
return vec_extract(bit, 0) | (int)vec_extract(bit, 1) << 1;
}
// truncate compatible with all compilers(internal use for now)
NPY_FINLINE npyv_s32 npyv__trunc_s32_f32(npyv_f32 a)
{
#ifdef __IBMC__
return vec_cts(a, 0);
#elif defined(__clang__)
/**
* old versions of CLANG doesn't support %x<n> in the inline asm template
* which fixes register number when using any of the register constraints wa, wd, wf.
* therefore, we count on built-in functions.
*/
return __builtin_convertvector(a, npyv_s32);
#else // gcc
npyv_s32 ret;
__asm__ ("xvcvspsxws %x0,%x1" : "=wa" (ret) : "wa" (a));
return ret;
#endif
}
NPY_FINLINE npyv_s32 npyv__trunc_s32_f64(npyv_f64 a, npyv_f64 b)
{
#ifdef __IBMC__
const npyv_u8 seq_even = npyv_set_u8(0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27);
// unfortunately, XLC missing asm register vsx fixer
// hopefully, xlc can optimize around big-endian compatibility
npyv_s32 lo_even = vec_cts(a, 0);
npyv_s32 hi_even = vec_cts(b, 0);
return vec_perm(lo_even, hi_even, seq_even);
#else
const npyv_u8 seq_odd = npyv_set_u8(4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31);
#ifdef __clang__
// __builtin_convertvector doesn't support this conversion on wide range of versions
// fortunately, almost all versions have direct builtin of 'xvcvdpsxws'
npyv_s32 lo_odd = __builtin_vsx_xvcvdpsxws(a);
npyv_s32 hi_odd = __builtin_vsx_xvcvdpsxws(b);
#else // gcc
npyv_s32 lo_odd, hi_odd;
__asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (lo_odd) : "wa" (a));
__asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (hi_odd) : "wa" (b));
#endif
return vec_perm(lo_odd, hi_odd, seq_odd);
#endif
}
// round to nearest integer (assuming even)
NPY_FINLINE npyv_s32 npyv_round_s32_f32(npyv_f32 a)
{ return npyv__trunc_s32_f32(vec_rint(a)); }
NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
{ return npyv__trunc_s32_f64(vec_rint(a), vec_rint(b)); }
#endif // _NPY_SIMD_VSX_CVT_H