-
Notifications
You must be signed in to change notification settings - Fork 1
/
vsinf.cpp
171 lines (125 loc) · 5.96 KB
/
vsinf.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
/*
vsinf.cpp
Created by damien on 8/03/09.
Copyright 2009 Damien Morton. All rights reserved.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising
from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product documentation
would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must
not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "vsinf.h"
#include <math.h>
float fast_sin(float x);
inline float abs(float x) { return (x < 0) ? -x : x; }
float fast_sin(float x)
{
// fast sin function; maximum error is 0.001
const float P = 0.225;
x = x * M_1_PI;
int k = (int) round(x);
x = x - k;
float y = (4 - 4 * abs(x)) * x;
y = P * (y * abs(y) - y) + y;
return (k&1) ? -y : y;
}
void vsinf(float *x, float *y, int n) // input x, output y, n multiple of 4
{
// source and destination arrays can be the same
const float P = 0.225;
float consts[4] = { M_1_PI, 1.0, 4.0, P};
const float neg1 = -1.0;
int a, b, c, d;
b = (int) consts;
asm volatile (
"bic %[n], %[n], #3 \n\t" // n has to be a multiple of 4 - lets enforce that (and avoid infinte loops)
"fldmias %[b], {s0-s3} \n\t" // load up our constants
"fldmias %[x]!, {s8-s11} \n\t" // v2 = x // our loop is a little bit overlapped - preload some stuff
"fmrx %[a], fpscr \n\t"
"stmfd sp!, {%[a]} \n\t" // save fpscr to the stack (dont forget to restore it and the stack pointer)
"bic %[a], %[a], #0x001f \n\t" // for runfast - clear CumulativeException[4:0] bits
"bic %[a], %[a], #0x1f00 \n\t" // for runfast - clear ExceptionTrapEnable[12:8] bits
"orr %[a], %[a], #(0x3 << 24) \n\t" // for runfast - set DefaultNaN[25] and FushToZero[24] bits
"bic %[a], %[a], #(0x37 << 16) \n\t" // clear the vector stride[21:20] and len[18:16] bits
"orr %[a], %[a], #(0x03 << 16) \n\t" // set stride[21:20] to 1(0), vector len[18:16] to 4(3)
"fmxr fpscr, %[a] \n\t"
"fmuls s8, s8, s0 \n\t" // v3: = x * (1/PI)
"ftosis s12, s8 \n\t" // v3: k = round(x)
"ftosis s13, s9 \n\t"
"ftosis s14, s10 \n\t"
"ftosis s15, s11 \n\t"
"fmrrs %[a],%[b], {s12,s13} \n\t"
"fsitos s12, s12 \n\t" // k = (float) k
"fsitos s13, s13 \n\t"
"fmrrs %[c],%[d], {s14,s15} \n\t"
"fsitos s14, s14 \n\t"
"fsitos s15, s15 \n\t"
"1: \n\t"
//OOO out of order instructions marked with OOO and are usually suffixed with the ne condition
"fsubs s8, s8, s12 \n\t" // x = x - k
"fcpys s28, s1 \n\t" // v7 = 1111
"fcpys s24, s2 \n\t" // OOO v6 = 4444
//float x = x * M_1_PI;
//int k = round(x);
//float x = x - k;
"fstmiasne %[y]!, {s20-s23} \n\t" // OOO: from previous loop
"tst %[a], #1 \n\t" // if odd(n) then negate result at end
"fmsrne s28, %[neg1] \n\t" // shove a -1 into the appropriate part of v7
"tst %[b], #1 \n\t" // 4 tests, spread through code - result not needed till end
"fmsrne s29, %[neg1] \n\t"
"fabss s16, s8 \n\t" // x' = abs(x)
// y = (4 - 4 * abs(x)) * x;
"tst %[c], #1 \n\t"
"fmsrne s30, %[neg1] \n\t"
"fnmacs s24, s16, s2 \n\t" // y' = 4 - 4*abs(x)
"tst %[d], #1 \n\t"
"fmsrne s31, %[neg1] \n\t"
"fmuls s16, s24, s8 \n\t" // y' *= x
"subs %[n], %[n], #4 \n\t" // testing early here
"fldmiasne %[x]!, {s8-s11} \n\t" // v2 = x // our loop is a little bit overlapped - preload some stuff
"pld [%[x],#64] \n\t" // In the ARM1176JZF-S processor, in Non-secure state, the PLD instruction behaves like a NOP. Dammit.
// y += P * (y * abs(y) - y);
"fabss s20, s16 \n\t" // v4 = abs(y)
"fcpys s24, s16 \n\t"
"fmulsne s8, s8, s0 \n\t" // OOO v2 = x * (1/PI)
"fmscs s16, s16, s20 \n\t" // v3 = -y + y*abs(y)
"ftosisne s12, s8 \n\t" // OOO v3: k = round(x)
"ftosisne s13, s9 \n\t"
"ftosisne s14, s10 \n\t"
"ftosisne s15, s11 \n\t"
"fmacs s24, s16, s3 \n\t" // y += P*v3
"fmrrsne %[a],%[b], {s12,s13} \n\t"
"fsitosne s12, s12 \n\t" // k = (float) k
"fsitosne s13, s13 \n\t"
"fmrrsne %[c],%[d], {s14,s15} \n\t"
"fsitosne s14, s14 \n\t"
"fsitosne s15, s15 \n\t"
"fmuls s20, s28, s24 \n\t" // multiply result by +1/-1 according to oddness of n
"bne 1b \n\t"
"fstmias %[y]!, {s20-s23} \n\t"
"ldmfd sp!, {%[b]} \n\t" // restore the fpscr
"fmxr fpscr, %[b] \n\t"
: [x] "+&r" (x),
[y] "+&r" (y),
[n] "+&r" (n),
[neg1] "+&r" (neg1),
[a] "+&r" (a),
[b] "+&r" (b),
[c] "+&r" (c),
[d] "+&r" (d)
:
: "memory", "cc",
"s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
"s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
"s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
"s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31"
);
}