asembler je fasm za Linux. jedino je sin avx2 pa to mozete da vratite na avx1 lako gledajuci ostale f-je.
format elf64
define AVX2 1
macro polevl p1,p2,p3 {
local .L0
; %1 = ymm value,%2 = addr coef array , %3 = N
mov rbx,p2
lea rcx,[rbx+8*p3]
vbroadcastsd ymm15,[rbx]
.L0:
add rbx,8
if AVX2
vbroadcastsd ymm14,[rbx]
vfmadd213pd ymm15,p1,ymm14
else
vmulpd ymm15,ymm15,p1
vbroadcastsd ymm14,[rbx]
vaddpd ymm15,ymm15,ymm14
end if
cmp rbx,rcx
jl .L0
}
macro p1evl p1,p2,p3 {
local .L0
; %1 = ymm value,%2 = addr coef array , %3 = N
mov rbx,p2
lea rcx,[rbx+8*(p3-1)]
vbroadcastsd ymm15,[rbx]
vaddpd ymm15,ymm15,p1
.L0:
add rbx,8
if AVX2
vbroadcastsd ymm14,[rbx]
vfmadd213pd ymm15,p1,ymm14
else
vmulpd ymm15,ymm15,p1
vbroadcastsd ymm14,[rbx]
vaddpd ymm15,ymm15,ymm14
end if
cmp rbx,rcx
jl .L0
}
section '.text' executable
public tangent
public sine
public cosine
public sin_pd
public cos_pd
public tan_pd
public x87sincos
public sincos_pd
tangent:
sub rsp,8
vmovsd [rsp],xmm0
fld qword[rsp]
fptan
fstp st0
fstp qword[rsp]
vmovsd xmm0,[rsp]
add rsp,8
ret
sine:
sub rsp,8
vmovsd [rsp],xmm0
fld qword[rsp]
fsin
fstp qword[rsp]
vmovsd xmm0,[rsp]
add rsp,8
ret
cosine:
sub rsp,8
vmovsd [rsp],xmm0
fld qword[rsp]
fcos
fstp qword[rsp]
vmovsd xmm0,[rsp]
add rsp,8
ret
x87sincos:
sub rsp,8
vmovsd [rsp],xmm0
fld qword[rsp]
fsincos
fstp qword[rsi]
fstp qword[rdi]
add rsp,8
ret
; ymm0 -> x
sin_pd:
push rbx
vmovapd ymm1,ymm0
vmovapd ymm2,[sign_mask]
vandnpd ymm0,ymm2,ymm0 ; make positive x
vandpd ymm1,ymm1,[sign_mask] ; save sign bit
vbroadcastsd ymm2,[O4PI]
vmulpd ymm3,ymm0,ymm2 ;
vroundpd ymm3,ymm3,3 ; truncate y
vmulpd ymm2,ymm3,[OD16]
vroundpd ymm2,ymm2,3
; vmulpd ymm2,ymm2,[S16]
; vsubpd ymm2,ymm3,ymm2
vfnmadd132pd ymm2,ymm3,[S16]
vcvttpd2dq xmm2,ymm2 ; j
vpand xmm4,xmm2,[mask_1]
vpaddd xmm2,xmm2,xmm4 ; j += 1
vcvtdq2pd ymm4,xmm4
vaddpd ymm3,ymm3,ymm4 ; y += 1.0
vpand xmm4,xmm2,[mask_4]
vpslld xmm4,xmm4,29 ; move mask to highest position
if AVX2 ; just example, too lazy to repeat for other functions ;)
vpmovzxdq ymm4,xmm4
vpsllq ymm4,ymm4,32
else
vpmovzxdq xmm5,xmm4
vpsllq xmm5,xmm5,32
vpsrldq xmm4,xmm4,8
vpmovzxdq xmm6,xmm4
vpsllq xmm6,xmm6,32
vmovapd xmm4,xmm5
vinsertf128 ymm4,ymm4,xmm6,1
end if
vxorpd ymm1,ymm1,ymm4 ; invert sign
vpand xmm4,xmm2,[mask_3]
vpcmpeqd xmm4,xmm4,[mask_0]
if AVX2
vpmovsxdq ymm4,xmm4
else
vpmovsxdq xmm5,xmm4
vpsrldq xmm4,xmm4,8
vpmovsxdq xmm6,xmm4
vmovapd xmm4,xmm5
vinsertf128 ymm4,ymm4,xmm6,1 ; selection mask
end if
; Extended precision modular arithmetic
; vmulpd ymm5,ymm3,[DP1]
; vmulpd ymm6,ymm3,[DP2]
; vmulpd ymm7,ymm3,[DP3]
; vsubpd ymm0,ymm0,ymm5
; vsubpd ymm0,ymm0,ymm6
; vsubpd ymm0,ymm0,ymm7
vfnmadd231pd ymm0,ymm3,[DP1]
vfnmadd231pd ymm0,ymm3,[DP2]
vfnmadd231pd ymm0,ymm3,[DP3]
vmulpd ymm5,ymm0,ymm0 ; x^2
; first
polevl ymm5,sincof,5
vmulpd ymm15,ymm15,ymm5
; vmulpd ymm15,ymm15,ymm0
; vaddpd ymm6,ymm15,ymm0 ; y1
vfmadd132pd ymm15,ymm0,ymm0
vmovapd ymm6,ymm15
; second
polevl ymm5,coscof,5
vmulpd ymm15,ymm15,ymm5
vmulpd ymm15,ymm15,ymm5
; vmulpd ymm7,ymm5,[OD2]
; vsubpd ymm7,ymm15,ymm7
vfnmadd231pd ymm15,ymm5,[OD2]
vmovapd ymm7,ymm15
vaddpd ymm7,ymm7,[one]; y2
; combine
vandpd ymm6,ymm4,ymm6
vandnpd ymm7,ymm4,ymm7
vaddpd ymm0,ymm6,ymm7
vxorpd ymm0,ymm0,ymm1
pop rbx
ret
; ymm0 -> x
cos_pd:
push rbx
vmovapd ymm1,ymm0
vmovapd ymm2,[sign_mask]
vandnpd ymm0,ymm2,ymm0 ; make positive x
vandpd ymm1,ymm1,[sign_mask] ; save sign bit
vbroadcastsd ymm2,[O4PI]
vmulpd ymm3,ymm0,ymm2 ;
vroundpd ymm3,ymm3,3 ; truncate y
vmulpd ymm2,ymm3,[OD16]
vroundpd ymm2,ymm2,3
vmulpd ymm2,ymm2,[S16]
vsubpd ymm2,ymm3,ymm2
vcvttpd2dq xmm2,ymm2 ; j
vpand xmm4,xmm2,[mask_1]
vpaddd xmm2,xmm2,xmm4 ; j += 1
vcvtdq2pd ymm4,xmm4
vaddpd ymm3,ymm3,ymm4 ; y += 1.0
vpand xmm4,xmm2,[mask_4]
vpslld xmm4,xmm4,29 ; move mask to highest position
vpmovzxdq xmm5,xmm4
vpsllq xmm5,xmm5,32
vpsrldq xmm4,xmm4,8
vpmovzxdq xmm6,xmm4
vpsllq xmm6,xmm6,32
vmovapd xmm4,xmm5
vinsertf128 ymm4,ymm4,xmm6,1
vxorpd ymm1,ymm1,ymm4 ; invert sign
vpand xmm4,xmm2,[mask_3]
vpcmpeqd xmm4,xmm4,[mask_0]
vpmovsxdq xmm5,xmm4
vpsrldq xmm4,xmm4,8
vpmovsxdq xmm6,xmm4
vmovapd xmm4,xmm5
vinsertf128 ymm4,ymm4,xmm6,1 ; selection mask
; Extended precision modular arithmetic
vmulpd ymm5,ymm3,[DP1]
vmulpd ymm6,ymm3,[DP2]
vmulpd ymm7,ymm3,[DP3]
vsubpd ymm0,ymm0,ymm5
vsubpd ymm0,ymm0,ymm6
vsubpd ymm0,ymm0,ymm7
vmulpd ymm5,ymm0,ymm0 ; x^2
; first
polevl ymm5,sincof,5
vmulpd ymm15,ymm15,ymm5
vmulpd ymm15,ymm15,ymm0
vaddpd ymm6,ymm15,ymm0 ; y1
; second
polevl ymm5,coscof,5
vmulpd ymm15,ymm15,ymm5
vmulpd ymm15,ymm15,ymm5
vmulpd ymm7,ymm5,[OD2]
vsubpd ymm7,ymm15,ymm7
vaddpd ymm7,ymm7,[one]; y2
; combine
vandnpd ymm6,ymm4,ymm6
vandpd ymm7,ymm4,ymm7
vaddpd ymm0,ymm6,ymm7
vxorpd ymm0,ymm0,ymm1
pop rbx
ret
; ymm0 -> x
sincos_pd:
push rbx
vmovapd ymm1,ymm0
vmovapd ymm2,[sign_mask]
vandnpd ymm0,ymm2,ymm0 ; make positive x
vandpd ymm1,ymm1,[sign_mask] ; save sign bit
vbroadcastsd ymm2,[O4PI]
vmulpd ymm3,ymm0,ymm2 ;
vroundpd ymm3,ymm3,3 ; truncate y
vmulpd ymm2,ymm3,[OD16]
vroundpd ymm2,ymm2,3
vmulpd ymm2,ymm2,[S16]
vsubpd ymm2,ymm3,ymm2
vcvttpd2dq xmm2,ymm2 ; j
vpand xmm4,xmm2,[mask_1]
vpaddd xmm2,xmm2,xmm4 ; j += 1
vcvtdq2pd ymm4,xmm4
vaddpd ymm3,ymm3,ymm4 ; y += 1.0
vpand xmm4,xmm2,[mask_4]
vpslld xmm4,xmm4,29 ; move mask to highest position
vpmovzxdq xmm5,xmm4
vpsllq xmm5,xmm5,32
vpsrldq xmm4,xmm4,8
vpmovzxdq xmm6,xmm4
vpsllq xmm6,xmm6,32
vmovapd xmm4,xmm5
vinsertf128 ymm4,ymm4,xmm6,1
vxorpd ymm1,ymm1,ymm4 ; invert sign
vpand xmm4,xmm2,[mask_3]
vpcmpeqd xmm4,xmm4,[mask_0]
vpmovsxdq xmm5,xmm4
vpsrldq xmm4,xmm4,8
vpmovsxdq xmm6,xmm4
vmovapd xmm4,xmm5
vinsertf128 ymm4,ymm4,xmm6,1 ; selection mask
; Extended precision modular arithmetic
vmulpd ymm5,ymm3,[DP1]
vmulpd ymm6,ymm3,[DP2]
vmulpd ymm7,ymm3,[DP3]
vsubpd ymm0,ymm0,ymm5
vsubpd ymm0,ymm0,ymm6
vsubpd ymm0,ymm0,ymm7
vmulpd ymm5,ymm0,ymm0 ; x^2
; first
polevl ymm5,sincof,5
vmulpd ymm15,ymm15,ymm5
vmulpd ymm15,ymm15,ymm0
vaddpd ymm6,ymm15,ymm0 ; y1
; second
polevl ymm5,coscof,5
vmulpd ymm15,ymm15,ymm5
vmulpd ymm15,ymm15,ymm5
vmulpd ymm7,ymm5,[OD2]
vsubpd ymm7,ymm15,ymm7
vaddpd ymm7,ymm7,[one]; y2
; combine sin
vandpd ymm8,ymm4,ymm6
vandnpd ymm9,ymm4,ymm7
vaddpd ymm0,ymm8,ymm9
vxorpd ymm0,ymm0,ymm1
vmovupd [rdi],ymm0
; combine cos
vandnpd ymm6,ymm4,ymm6
vandpd ymm7,ymm4,ymm7
vaddpd ymm0,ymm6,ymm7
vxorpd ymm0,ymm0,ymm1
vmovupd [rsi],ymm0
pop rbx
ret
tan_pd:
push rbx
vmovapd ymm1,ymm0
vmovapd ymm2,[sign_mask]
vandnpd ymm0,ymm2,ymm0 ; make positive x
vandpd ymm1,ymm1,[sign_mask] ; save sign bit
vbroadcastsd ymm2,[O4PI]
vmulpd ymm3,ymm0,ymm2 ;
vroundpd ymm3,ymm3,3 ; truncate y
vmulpd ymm2,ymm3,[OD16]
vroundpd ymm2,ymm2,3
vmulpd ymm2,ymm2,[S16]
vsubpd ymm2,ymm3,ymm2
vcvttpd2dq xmm2,ymm2 ; j
vpand xmm4,xmm2,[mask_1]
vpaddd xmm2,xmm2,xmm4 ; j += 1
vcvtdq2pd ymm4,xmm4
vaddpd ymm3,ymm3,ymm4 ; y += 1.0
vpand xmm4,xmm2,[mask_2]
vpcmpeqd xmm4,xmm4,[mask_0]
vpmovsxdq xmm5,xmm4
vpsrldq xmm4,xmm4,8
vpmovsxdq xmm6,xmm4
vmovapd xmm4,xmm5
vinsertf128 ymm4,ymm4,xmm6,1 ; selection mask 2
; Extended precision modular arithmetic
vmulpd ymm5,ymm3,[DP1]
vmulpd ymm6,ymm3,[DP2]
vmulpd ymm7,ymm3,[DP3]
vsubpd ymm0,ymm0,ymm5
vsubpd ymm0,ymm0,ymm6
vsubpd ymm0,ymm0,ymm7
vmulpd ymm5,ymm0,ymm0 ; x^2
vcmpnlepd ymm6,ymm5,[prec] ; selection mask 1
;calculate polynom
polevl ymm5,P,2
vmovapd ymm13,ymm15
p1evl ymm5,Q,4
vdivpd ymm13,ymm13,ymm15
vmulpd ymm13,ymm13,ymm5
vmulpd ymm13,ymm13,ymm0
vaddpd ymm13,ymm13,ymm0
vandpd ymm13,ymm6,ymm13
vandnpd ymm0,ymm6,ymm0 ; select according to mask 1
vaddpd ymm0,ymm13,ymm0
vmovapd ymm6,[mone]
vdivpd ymm7,ymm6,ymm0
vandpd ymm0,ymm4,ymm0
vandnpd ymm7,ymm4,ymm7 ; select according to mask 2
vaddpd ymm0,ymm0,ymm7
vxorpd ymm0,ymm0,ymm1 ; invert sign
pop rbx
ret
section '.data' writeable align 32
sincof dq \
1.58962301576546568060E-10,\
-2.50507477628578072866E-8,\
2.75573136213857245213E-6,\
-1.98412698295895385996E-4,\
8.33333333332211858878E-3,\
-1.66666666666666307295E-1
coscof dq \
-1.13585365213876817300E-11,\
2.08757008419747316778E-9,\
-2.75573141792967388112E-7,\
2.48015872888517045348E-5,\
-1.38888888888730564116E-3,\
4.16666666666665929218E-2
P dq \
-1.30936939181383777646E4, \
1.15351664838587416140E6, \
-1.79565251976484877988E7
Q dq \
1.36812963470692954678E4, \
-1.32089234440210967447E6,\
2.50083801823357915839E7,\
-5.38695755929454629881E7
O4PI dq 1.273239544735162
align 32
DP1: times 4 dq 7.85398125648498535156E-1;
DP2: times 4 dq 3.77489470793079817668E-8;
DP3: times 4 dq 2.69515142907905952645E-15;
sign_mask: times 4 dq 0x8000000000000000
prec: times 4 dq 1.0e-14
one: times 4 dq 1.0
mone: times 4 dq -1.0
OD16: times 4 dq 0.0625
S16: times 4 dq 16.0
OD2: times 4 dq 0.5
mask_0: times 4 dd 0
mask_1: times 4 dd 1
mask_2: times 4 dd 2
mask_3: times 4 dd 3
mask_4: times 4 dd 4
Evo kako se koristi:
#include <math.h>
#include <time.h>
#include <stdio.h>
#include <immintrin.h>
extern "C" double tangent(double x);
extern "C" double sine(double x);
extern "C" double cosine(double x);
extern "C" void x87sincos(double x,double* rsin,double* rcos);
extern "C" double md_sin(double x);
extern "C" double md_cos(double x);
extern "C" double md_tan(double x);
extern "C" __m256d sin_pd(__m256d x);
extern "C" __m256d cos_pd(__m256d x);
extern "C" void sincos_pd(__m256d x,__m256d* rsin,__m256d* rcos);
extern "C" __m256d tan_pd(__m256d x);
typedef double(*fp_t)(double);
typedef void(*fpb_t)(double,double*,double*);
typedef __m256d(*fpv_t)(__m256d);
typedef void(*fpvb_t)(__m256d,__m256d*,__m256d*);
#define NITER 10000000
void test(fp_t f, const char* str)
{
volatile double x = -1000.0, y;
clock_t begin = clock();
for(int i = 0;i<NITER;++i)
{
y = f(x);
x+=0.001;
}
clock_t end = clock();
double diff = double(end-begin)/CLOCKS_PER_SEC;
printf("%.15f %s time %f\n",y,str,diff);
}
void test(fpb_t f, const char* str)
{
double x = -1000.0, s,c;
clock_t begin = clock();
for(int i = 0;i<NITER;++i)
{
f(x,&s,&c);
x+=0.001;
}
clock_t end = clock();
double diff = double(end-begin)/CLOCKS_PER_SEC;
printf("%.15f %.15f %s time %f\n",s,c,str,diff);
}
void test(fpv_t f, const char* str)
{
volatile __m256d x = _mm256_set1_pd(-1000.0),one = _mm256_set1_pd(0.001),y;
double res[4];
clock_t begin = clock();
for(int i = 0;i<NITER;++i)
{
y = f(x);
x = _mm256_add_pd(x,one);
}
clock_t end = clock();
double diff = double(end-begin)/CLOCKS_PER_SEC;
_mm256_storeu_pd(res,y);
printf("%.15f %s time %f\n",res[2],str,diff);
}
void test(fpvb_t f, const char* str)
{
__m256d x = _mm256_set1_pd(-1000.0),one = _mm256_set1_pd(0.001);
double ress[4],resc[4];
clock_t begin = clock();
for(int i = 0;i<NITER;++i)
{
f(x,(__m256d*)ress,(__m256d*)resc);
x = _mm256_add_pd(x,one);
}
clock_t end = clock();
double diff = double(end-begin)/CLOCKS_PER_SEC;
printf("%.15f %.15f %s time %f\n",
ress[0],resc[3],str,diff);
}
int main()
{
test(tangent,"x87 tangent");
test(tangent,"x87 tangent");
test(tan,"clib tan");
test(md_tan,"cephes tan");
test(tan_pd,"tan_pd");
test(sine,"x87 sine");
test(sin,"clib sin");
test(md_sin,"cephes sin");
test(sin_pd,"sin_pd");
test(cosine,"x87 cosine");
test(cos,"clib cos");
test(md_cos,"cephes cos");
test(cos_pd,"cos_pd");
test(x87sincos,"x87 sincos");
test(sincos,"clib sincos");
test(sincos_pd,"sincos_pd");
}