1 | #ifdef __OPENCL_VERSION__ |
---|
2 | # define USE_OPENCL |
---|
3 | #endif |
---|
4 | |
---|
5 | // If opencl is not available |
---|
6 | #ifndef USE_OPENCL |
---|
7 | # include <math.h> |
---|
8 | # define REAL(x) (x) |
---|
9 | # define real double |
---|
10 | # define global |
---|
11 | # define local |
---|
12 | # define kernel extern "C" |
---|
13 | #endif |
---|
14 | |
---|
15 | #ifndef M_PI |
---|
16 | # define M_PI REAL(3.141592653589793) |
---|
17 | #endif |
---|
18 | #define RADIANS REAL(0.017453292519943295) |
---|
19 | #define PI_E8 REAL(3.141592653589793e8) |
---|
20 | |
---|
21 | real f(real qx, real qy, real sub, real radius_a, real radius_b, real scale, real axis_theta, real axis_phi); |
---|
22 | real f(real qx, real qy, real sub, real radius_a, real radius_b, real scale, real axis_theta, real axis_phi) |
---|
23 | { |
---|
24 | real ret = 0; |
---|
25 | const real q = sqrt(qx*qx + qy*qy); |
---|
26 | const real theta = axis_theta*RADIANS; |
---|
27 | const real cos_val = cos(theta)*cos(axis_phi*RADIANS)*(qx/q) + sin(theta)*(qy/q); |
---|
28 | |
---|
29 | const real arg = q*radius_b*sqrt(1.0+(cos_val*cos_val*(((radius_a*radius_a/(radius_b*radius_b))-1.0)))); |
---|
30 | if(arg == 0.0){ |
---|
31 | ret = 1.0/3.0; |
---|
32 | } |
---|
33 | else{ |
---|
34 | ret = (sin(arg)-arg*cos(arg))/(arg*arg*arg); |
---|
35 | } |
---|
36 | ret*=ret*9.0*sub*sub*4.0/3.0*acos(-1.0)*radius_b*radius_b*radius_a*(1.0e8); |
---|
37 | |
---|
38 | return radius_a*scale*ret*pow(radius_b, 2); |
---|
39 | } |
---|
40 | |
---|
41 | kernel void EllipsoidKernel(global const real *qx, global const real *qy, global real *result, |
---|
42 | #ifdef USE_OPENCL |
---|
43 | global real *loops_g, |
---|
44 | #else |
---|
45 | const int Nq, |
---|
46 | #endif |
---|
47 | local real *loops, const real cutoff, |
---|
48 | const real scale, const real background, |
---|
49 | const real sub, |
---|
50 | const int Nradius_a, const int Nradius_b, const int Ntheta, const int Nphi) |
---|
51 | { |
---|
52 | #ifdef USE_OPENCL |
---|
53 | // copy loops info to local memory |
---|
54 | event_t e = async_work_group_copy(loops, loops_g, (Nradius_a+Nradius_b+Ntheta+Nphi)*2, 0); |
---|
55 | wait_group_events(1, &e); |
---|
56 | |
---|
57 | int i = get_global_id(0); |
---|
58 | int count = get_global_size(0); |
---|
59 | |
---|
60 | if(i < count) |
---|
61 | #else |
---|
62 | #pragma omp parallel for |
---|
63 | for (int i=0; i<Nq; i++) |
---|
64 | #endif |
---|
65 | { |
---|
66 | const real qxi=qx[i]; |
---|
67 | const real qyi=qy[i]; |
---|
68 | real ret=REAL(0.0), norm=REAL(0.0), norm_vol=REAL(0.0), vol=REAL(0.0); |
---|
69 | for (int ai=0; ai < Nradius_a; ai++) { |
---|
70 | const real rav = loops[2*ai]; |
---|
71 | const real raw = loops[2*ai+1]; |
---|
72 | for (int bi=0; bi < Nradius_b; bi++) { |
---|
73 | const real rbv = loops[2*(bi+Nradius_a)]; |
---|
74 | const real rbw = loops[2*(bi+Nradius_a)+1]; |
---|
75 | for (int thi=0; thi < Ntheta; thi++) { |
---|
76 | const real thv = loops[2*(thi+Nradius_a+Nradius_b)]; |
---|
77 | const real thw = loops[2*(thi+Nradius_a+Nradius_b)+1]; |
---|
78 | // #pragma unroll |
---|
79 | for (int phi=0; phi < Nphi; phi++) { |
---|
80 | const real phv = loops[2*(phi+Nradius_a+Nradius_b+Ntheta)]; |
---|
81 | const real phw = loops[2*(phi+Nradius_a+Nradius_b+Ntheta)+1]; |
---|
82 | |
---|
83 | const real weight = raw*rbw*thw*phw; |
---|
84 | //ret += qxi + qyi + sub + rav + rbv + weight + thv + phv; |
---|
85 | if (weight > cutoff) { |
---|
86 | ret += f(qxi, qyi, sub, rav, rbv, weight, thv, phv); |
---|
87 | norm += weight; |
---|
88 | vol += raw*rbw*rav*rbv*rbv; |
---|
89 | norm_vol += raw*rbw; |
---|
90 | } |
---|
91 | } |
---|
92 | } |
---|
93 | } |
---|
94 | } |
---|
95 | //if (Ntheta>1) norm = norm/(M_PI/2); |
---|
96 | if (vol != REAL(0.0) && norm_vol != REAL(0.0)) { |
---|
97 | ret *= norm_vol/vol; |
---|
98 | } |
---|
99 | result[i] = scale*ret/norm+background; |
---|
100 | } |
---|
101 | } |
---|