1 | |
---|
2 | /* |
---|
3 | ########################################################## |
---|
4 | # # |
---|
5 | # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # |
---|
6 | # !! !! # |
---|
7 | # !! KEEP THIS CODE CONSISTENT WITH KERNELPY.PY !! # |
---|
8 | # !! !! # |
---|
9 | # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # |
---|
10 | # # |
---|
11 | ########################################################## |
---|
12 | */ |
---|
13 | |
---|
14 | /* |
---|
15 | The environment needs to provide the following #defines |
---|
16 | USE_OPENCL is defined if running in opencl |
---|
17 | KERNEL declares a function to be available externally |
---|
18 | KERNEL_NAME is the name of the function being declared |
---|
19 | NPARS is the number of parameters in the kernel |
---|
20 | PARAMETER_DECL is the declaration of the parameters to the kernel. |
---|
21 | |
---|
22 | Cylinder: |
---|
23 | |
---|
24 | #define PARAMETER_DECL \ |
---|
25 | double length; \ |
---|
26 | double radius; \ |
---|
27 | double sld; \ |
---|
28 | double sld_solvent |
---|
29 | |
---|
30 | Multi-shell cylinder (10 shell max): |
---|
31 | |
---|
32 | #define PARAMETER_DECL \ |
---|
33 | double num_shells; \ |
---|
34 | double length; \ |
---|
35 | double radius[10]; \ |
---|
36 | double sld[10]; \ |
---|
37 | double sld_solvent |
---|
38 | |
---|
39 | PARAMETER_CALL(var) is the declaration of a call to the kernel. |
---|
40 | |
---|
41 | Cylinder: |
---|
42 | |
---|
43 | #define PARAMETER_CALL(var) \ |
---|
44 | var.length, \ |
---|
45 | var.radius, \ |
---|
46 | var.sld, \ |
---|
47 | var.sld_solvent |
---|
48 | |
---|
49 | Multi-shell cylinder: |
---|
50 | #define PARAMETER_CALL(var) \ |
---|
51 | var.num_shells, \ |
---|
52 | var.length, \ |
---|
53 | var.radius, \ |
---|
54 | var.sld, \ |
---|
55 | var.sld_solvent |
---|
56 | |
---|
57 | INVALID is a test for model parameters in the correct range |
---|
58 | |
---|
59 | Cylinder: |
---|
60 | |
---|
61 | #define INVALID(var) 0 |
---|
62 | |
---|
63 | BarBell: |
---|
64 | |
---|
65 | #define INVALID(var) (var.bell_radius > var.radius) |
---|
66 | |
---|
67 | Our design supports a limited number of polydispersity loops, wherein |
---|
68 | we need to cycle through the values of the polydispersity, calculate |
---|
69 | the I(q, p) for each combination of parameters, and perform a normalized |
---|
70 | weighted sum across all the weights. Parameters may be passed to the |
---|
71 | underlying calculation engine as scalars or vectors, but the polydispersity |
---|
72 | calculator treats the parameter set as one long vector. |
---|
73 | |
---|
74 | Let's assume we have 6 polydisperse parameters |
---|
75 | 0 : sld_solvent {s1 = constant} |
---|
76 | 1: sld_material {s2 = constant} |
---|
77 | 2: radius {r = vector of 10pts} |
---|
78 | 3: length {l = vector of 30pts} |
---|
79 | 4: background {b = constant} |
---|
80 | 5: scale {s = constant} |
---|
81 | |
---|
82 | The polydisperse parameters are stored in as an array of parameter |
---|
83 | indices, one for each polydisperse parameter, stored in pd_par[n]. |
---|
84 | For the above mentioned example that will give pd_par = {3, 2, x, x}, |
---|
85 | where x stands for an arbitrary number and 3 corresponds to the longest parameter (length). |
---|
86 | Non-polydisperse parameters do not appear in this array. Each polydisperse |
---|
87 | parameter has a weight vector whose length is stored in pd_length[n], |
---|
88 | In our case pd_length = {30,10,0,0}. The weights are stored in |
---|
89 | a contiguous vector of weights for all parameters, with the starting position |
---|
90 | for each parameter stored in pd_offset[n] (pd_offset = {10,0,x,x}). |
---|
91 | The values corresponding to the weights are stored together in a |
---|
92 | separate weights[] vector, with offset stored in par_offset[pd_par[n]]. |
---|
93 | In the above mentioned example weight = {r0, .., r9, l0, .., l29}. |
---|
94 | Polydisperse parameters should be stored in decreasing order of length |
---|
95 | for highest efficiency. |
---|
96 | |
---|
97 | We limit the number of polydisperse dimensions to MAX_PD (currently 4). |
---|
98 | This cuts the size of the structure in half compared to allowing a |
---|
99 | separate polydispersity for each parameter. This will help a little |
---|
100 | bit for models with large numbers of parameters, such as the onion model. |
---|
101 | |
---|
102 | Parameters may be coordinated. That is, we may have the value of one |
---|
103 | parameter depend on a set of other parameters, some of which may be |
---|
104 | polydisperse. For example, if sld is inversely proportional to the |
---|
105 | volume of a cylinder, and the length and radius are independently |
---|
106 | polydisperse, then for each combination of length and radius we need a |
---|
107 | separate value for the sld. The caller must provide a coordination table |
---|
108 | for each parameter containing the value for each parameter given the |
---|
109 | value of the polydisperse parameters v1, v2, etc. The tables for each |
---|
110 | parameter are arranged contiguously in a vector, with offset[k] giving the |
---|
111 | starting location of parameter k in the vector. Each parameter defines |
---|
112 | coord[k] as a bit mask indicating which polydispersity parameters the |
---|
113 | parameter depends upon. Usually this is zero, indicating that the parameter |
---|
114 | is independent, but for the cylinder example given, the bits for the |
---|
115 | radius and length polydispersity parameters would both be set, the result |
---|
116 | being a (#radius x #length) table, or maybe a (#length x #radius) table |
---|
117 | if length comes first in the polydispersity table. |
---|
118 | |
---|
119 | NB: If we can guarantee that a compiler and OpenCL driver are available, |
---|
120 | we could instead create the coordination function on the fly for each |
---|
121 | parameter, saving memory and transfer time, but requiring a C compiler |
---|
122 | as part of the environment. |
---|
123 | |
---|
124 | In ordering the polydisperse parameters by decreasing length we can |
---|
125 | iterate over the longest dispersion weight vector first. All parameters |
---|
126 | coordinated with this weight vector (the 'fast' parameters), can be |
---|
127 | updated with a simple increment to the next position in the parameter |
---|
128 | value table. The indices of these parameters is stored in fast_coord_index[], |
---|
129 | with fast_coord_count being the number of fast parameters. A total |
---|
130 | of NPARS slots is allocated to allow for the case that all parameters |
---|
131 | are coordinated with the fast index, though this will likely be mostly |
---|
132 | empty. When the fast increment count reaches the end of the weight |
---|
133 | vector, then the index of the second polydisperse parameter must be |
---|
134 | incremented, and all of its coordinated parameters updated. Because this |
---|
135 | operation is not in the inner loop, a slower algorithm can be used. |
---|
136 | |
---|
137 | If there is no polydispersity we pretend that it is polydispersity with one |
---|
138 | parameter. |
---|
139 | |
---|
140 | The problem details structure can be allocated and sent in as an integer |
---|
141 | array using the read-only flag. This allows us to copy it once per fit |
---|
142 | along with the weights vector, since features such as the number of |
---|
143 | polydispersity elements per pd parameter or the coordination won't change |
---|
144 | between function evaluations. A new parameter vector is sent for |
---|
145 | each I(q) evaluation. |
---|
146 | |
---|
147 | To protect against expensive evaluations taking all the GPU resource |
---|
148 | on large fits, the entire polydispersity will not be computed at once. |
---|
149 | Instead, a start and stop location will be sent, indicating where in the |
---|
150 | polydispersity loop the calculation should start and where it should |
---|
151 | stop. We can do this for arbitrary start/stop points since we have |
---|
152 | unwound the nested loop. Instead, we use the same technique as array |
---|
153 | index translation, using div and mod to figure out the i,j,k,... |
---|
154 | indices in the virtual nested loop. |
---|
155 | |
---|
156 | The results array will be initialized to zero for polydispersity loop |
---|
157 | entry zero, and preserved between calls to [start, stop] so that the |
---|
158 | results accumulate by the time the loop has completed. Background and |
---|
159 | scale will be applied when the loop reaches the end. This does require |
---|
160 | that the results array be allocated read-write, which is less efficient |
---|
161 | for the GPU, but it makes the calling sequence much more manageable. |
---|
162 | |
---|
163 | Scale and background cannot be coordinated with other polydisperse parameters |
---|
164 | |
---|
165 | TODO: cutoff |
---|
166 | */ |
---|
167 | |
---|
168 | #define MAX_PD 4 // MAX_PD is the max number of polydisperse parameters |
---|
169 | #define PD_2N 16 // PD_2N is the size of the coordination step table |
---|
170 | |
---|
// Read-only description of the polydispersity problem.  Sent to the device
// once per fit (along with the weights vector) and reused across I(q)
// evaluations; only the parameter values change per evaluation.
typedef struct {
    int pd_par[MAX_PD];          // index of the nth polydispersity variable
    int pd_length[MAX_PD];       // length of the nth polydispersity weight vector
    int pd_offset[MAX_PD];       // offset of pd weights in the par & weight vector
    int pd_stride[MAX_PD];       // stride to move to the next index at this level
    int par_offset[NPARS];       // offset of par values in the par & weight vector
    int par_coord[NPARS];        // polydispersity coordination bitvector
                                 // (bit k set => parameter varies with pd dim k)
    int fast_coord_count;        // number of parameters coordinated with pd 1
    int fast_coord_index[NPARS]; // index of the fast coordination parameters
                                 // (NPARS slots so all pars may be fast-coordinated)
} ProblemDetails;
---|
181 | |
---|
// Named parameter values for one point in the polydispersity hypercube.
// PARAMETER_DECL is a model-specific #define supplied by the environment
// (see the header comment above for cylinder / multi-shell examples).
typedef struct {
    PARAMETER_DECL;
} ParameterBlock;
---|
185 | |
---|
186 | |
---|
187 | KERNEL |
---|
188 | void KERNEL_NAME( |
---|
189 | int nq, // number of q values |
---|
190 | global const ProblemDetails *problem, |
---|
191 | global const double *weights, |
---|
192 | global const double *pars, |
---|
193 | global const double *q, // nq q values, with padding to boundary |
---|
194 | global double *result, // nq return values, again with padding |
---|
195 | global int *offset, // nq+padding space for current parameter index |
---|
196 | const double cutoff, // cutoff in the polydispersity weight product |
---|
197 | const int pd_start, // where we are in the polydispersity loop |
---|
198 | const int pd_stop, // where we are stopping in the polydispersity loop |
---|
199 | ) |
---|
200 | { |
---|
201 | |
---|
202 | // Storage for the current parameter values. These will be updated as we |
---|
203 | // walk the polydispersity cube. |
---|
204 | local ParameterBlock local_pars; |
---|
205 | const double *parvec = &local_pars; // Alias named parameters with a vector |
---|
206 | |
---|
207 | #if defined(USE_SHORTCUT_OPTIMIZATION) |
---|
208 | if (pd_length[0] == 1) { |
---|
209 | // Shouldn't need to copy!! |
---|
210 | for (int k=0; k < NPARS; k++) { |
---|
211 | parvec[k] = pars[k]; |
---|
212 | } |
---|
213 | |
---|
214 | #ifdef USE_OPENMP |
---|
215 | #pragma omp parallel for |
---|
216 | #endif |
---|
217 | for (int i=0; i < nq; i++) { |
---|
218 | { |
---|
219 | const double scattering = IQ_FUNC(IQ_PARS, IQ_PARAMETERS); |
---|
220 | result[i] += local_pars.scale*scattering + local_pars.background; |
---|
221 | } |
---|
222 | return; |
---|
223 | } |
---|
224 | #endif |
---|
225 | |
---|
226 | |
---|
227 | // Since we are no longer looping over the entire polydispersity hypercube |
---|
228 | // for each q, we need to track the normalization values for each q in a |
---|
229 | // separate work vector. |
---|
230 | double norm; // contains sum over weights |
---|
231 | double vol; // contains sum over volume |
---|
232 | double norm_vol; // contains weights over volume |
---|
233 | |
---|
234 | // Initialize the results to zero |
---|
235 | if (pd_start == 0) { |
---|
236 | norm_vol = 0.0; |
---|
237 | norm = 0.0; |
---|
238 | vol = 0.0; |
---|
239 | |
---|
240 | #ifdef USE_OPENMP |
---|
241 | #pragma omp parallel for |
---|
242 | #endif |
---|
243 | for (int i=0; i < nq; i++) { |
---|
244 | result[i] = 0.0; |
---|
245 | } |
---|
246 | } else { |
---|
247 | //Pulling values from previous segment |
---|
248 | norm = result[nq]; |
---|
249 | vol = result[nq+1]; |
---|
250 | norm_vol = results[nq+2]; |
---|
251 | } |
---|
252 | |
---|
253 | // Location in the polydispersity hypercube, one index per dimension. |
---|
254 | local int pd_index[PD_MAX]; |
---|
255 | |
---|
256 | // Set the initial index greater than its vector length in order to |
---|
257 | // trigger the reset behaviour that happens at the end the fast loop. |
---|
258 | pd_index[0] = pd_length[0]; |
---|
259 | |
---|
260 | // need product of weights at every Iq calc, so keep product of |
---|
261 | // weights from the outer loops so that weight = partial_weight * fast_weight |
---|
262 | double partial_weight = NAN; // product of weight w4*w3*w2 but not w1 |
---|
263 | |
---|
264 | // Loop over the weights then loop over q, accumulating values |
---|
265 | for (int loop_index=pd_start; loop_index < pd_stop; loop_index++) { |
---|
266 | // check if indices need to be updated |
---|
267 | if (pd_index[0] >= pd_length[0]) { |
---|
268 | pd_index[0] = loop_index%pd_length[0]; |
---|
269 | partial_weight = 1.0; |
---|
270 | for (int k=1; k < MAX_PD; k++) { |
---|
271 | pd_index[k] = (loop_index%pd_length[k])/pd_stride[k]; |
---|
272 | partial_weight *= weights[pd_offset[k]+pd_index[k]]; |
---|
273 | } |
---|
274 | weight = partial_weight * weights[pd_offset[0]+pd_index[0]] |
---|
275 | for (int k=0; k < NPARS; k++) { |
---|
276 | int coord = par_coord[k]; |
---|
277 | int this_offset = par_offset[k]; |
---|
278 | int block_size = 1; |
---|
279 | for (int bit=0; bit < MAX_PD && coord != 0; bit++) { |
---|
280 | if (coord&1) { |
---|
281 | this_offset += block_size * pd_index[bit]; |
---|
282 | block_size *= pd_length[bit]; |
---|
283 | } |
---|
284 | coord /= 2; |
---|
285 | } |
---|
286 | offset[k] = this_offset; |
---|
287 | parvec[k] = pars[this_offset]; |
---|
288 | } |
---|
289 | } else { |
---|
290 | pd_index[0] += 1; |
---|
291 | weight = partial_weight*weights[pd_offset[0]+pd_index[0]]; |
---|
292 | for (int k=0; k < problem->fast_coord_count; k++) { |
---|
293 | parvec[ fast_coord_index[k]] |
---|
294 | = pars[offset[fast_coord_index[k]] + pd_index[0]]; |
---|
295 | } |
---|
296 | } |
---|
297 | #ifdef INVALID |
---|
298 | if (INVALID(local_pars)) continue; |
---|
299 | #endif |
---|
300 | if (weight > cutoff) { |
---|
301 | const double vol_weight = VOLUME_WEIGHT_PRODUCT; |
---|
302 | const double weighted_vol = vol_weight*form_volume(VOLUME_PARAMETERS); |
---|
303 | norm += weight; |
---|
304 | vol += weighted_vol; |
---|
305 | norm_vol += vol_weight; |
---|
306 | |
---|
307 | #ifdef USE_OPENMP |
---|
308 | #pragma omp parallel for |
---|
309 | #endif |
---|
310 | for (int i=0; i < nq; i++) { |
---|
311 | { |
---|
312 | const double scattering = IQ_FUNC(IQ_PARS, IQ_PARAMETERS); |
---|
313 | //const double scattering = Iq(q[i], IQ_PARAMETERS); |
---|
314 | result[i] += weight*scattering; |
---|
315 | } |
---|
316 | } |
---|
317 | //Makes a normalization avialable for the next round |
---|
318 | result[nq] = norm; |
---|
319 | result[nq+1] = vol; |
---|
320 | results[nq+2] = norm_vol; |
---|
321 | |
---|
322 | //End of the PD loop we can normalize |
---|
323 | if (pd_stop == pd_stride[MAX_PD-1]) {} |
---|
324 | #ifdef USE_OPENMP |
---|
325 | #pragma omp parallel for |
---|
326 | #endif |
---|
327 | for (int i=0; i < nq; i++) { |
---|
328 | if (vol*norm_vol != 0.0) { |
---|
329 | result[i] *= norm_vol/vol; |
---|
330 | } |
---|
331 | result[i] = local_pars.scale*result[i]/norm+local_pars.background; |
---|
332 | } |
---|
333 | } |
---|
334 | } |
---|
335 | } |
---|