[2e44ac7] | 1 | |
---|
| 2 | /* |
---|
| 3 | ########################################################## |
---|
| 4 | # # |
---|
| 5 | # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # |
---|
| 6 | # !! !! # |
---|
| 7 | # !! KEEP THIS CODE CONSISTENT WITH KERNELPY.PY !! # |
---|
| 8 | # !! !! # |
---|
| 9 | # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # |
---|
| 10 | # # |
---|
| 11 | ########################################################## |
---|
| 12 | */ |
---|
| 13 | |
---|
| 14 | /* |
---|
[f9245d4] | 15 | The environment needs to provide the following #defines: |
---|
| 16 | |
---|
[2e44ac7] | 17 | USE_OPENCL is defined if running in opencl |
---|
| 18 | KERNEL declares a function to be available externally |
---|
| 19 | KERNEL_NAME is the name of the function being declared |
---|
| 20 | NPARS is the number of parameters in the kernel |
---|
[f9245d4] | 21 | PARAMETER_DECL is the declaration of the parameters to the kernel. |
---|
[2e44ac7] | 22 | |
---|
| 23 | Cylinder: |
---|
| 24 | |
---|
[f9245d4] | 25 | #define PARAMETER_DECL \ |
---|
[2e44ac7] | 26 | double length; \ |
---|
| 27 | double radius; \ |
---|
| 28 | double sld; \ |
---|
| 29 | double sld_solvent |
---|
| 30 | |
---|
[f9245d4] | 31 | Note: scale and background are not included |
---|
| 32 | |
---|
[2e44ac7] | 33 | Multi-shell cylinder (10 shell max): |
---|
| 34 | |
---|
| 35 | #define PARAMETER_DECL \ |
---|
| 36 | double num_shells; \ |
---|
| 37 | double length; \ |
---|
| 38 | double radius[10]; \ |
---|
| 39 | double sld[10]; \ |
---|
| 40 | double sld_solvent |
---|
| 41 | |
---|
| 42 | PARAMETER_CALL(var) is the declaration of a call to the kernel. |
---|
| 43 | |
---|
| 44 | Cylinder: |
---|
| 45 | |
---|
| 46 | #define PARAMETER_CALL(var) \ |
---|
| 47 | var.length, \ |
---|
| 48 | var.radius, \ |
---|
| 49 | var.sld, \ |
---|
| 50 | var.sld_solvent |
---|
| 51 | |
---|
| 52 | Multi-shell cylinder: |
---|
| 53 | #define PARAMETER_CALL(var) \ |
---|
| 54 | var.num_shells, \ |
---|
| 55 | var.length, \ |
---|
| 56 | var.radius, \ |
---|
| 57 | var.sld, \ |
---|
| 58 | var.sld_solvent |
---|
| 59 | |
---|
[3044216] | 60 | INVALID is a test for model parameters in the correct range |
---|
| 61 | |
---|
| 62 | Cylinder: |
---|
| 63 | |
---|
| 64 | #define INVALID(var) 0 |
---|
| 65 | |
---|
| 66 | BarBell: |
---|
| 67 | |
---|
| 68 | #define INVALID(var) (var.bell_radius > var.radius) |
---|
| 69 | |
---|
[f9245d4] | 70 | Model with complicated constraints: |
---|
| 71 | |
---|
| 72 | inline bool constrained(p1, p2, p3) { return expression; } |
---|
| 73 | #define INVALID(var) constrained(var.p1, var.p2, var.p3) |
---|
| 74 | |
---|
| 75 | IQ_FUNC could be Iq or Iqxy |
---|
| 76 | IQ_PARS could be q[i] or q[2*i],q[2*i+1] |
---|
| 77 | |
---|
[2e44ac7] | 78 | Our design supports a limited number of polydispersity loops, wherein |
---|
| 79 | we need to cycle through the values of the polydispersity, calculate |
---|
| 80 | the I(q, p) for each combination of parameters, and perform a normalized |
---|
| 81 | weighted sum across all the weights. Parameters may be passed to the |
---|
| 82 | underlying calculation engine as scalars or vectors, but the polydispersity |
---|
| 83 | calculator treats the parameter set as one long vector. |
---|
| 84 | |
---|
[f9245d4] | 85 | Let's assume we have 6 parameters in the model, with two polydisperse:: |
---|
| 86 | |
---|
| 87 | 0: scale {scl = constant} |
---|
| 88 | 1: background {bkg = constant} |
---|
| 89 | 5: length {l = vector of 30pts} |
---|
| 90 | 4: radius {r = vector of 10pts} |
---|
| 91 | 3: sld {s = constant/(radius**2*length)} |
---|
| 92 | 2: sld_solvent {s2 = constant} |
---|
| 93 | |
---|
| 94 | This generates the following call to the kernel (where x stands for an |
---|
| 95 | arbitrary value that is not used by the kernel evaluator): |
---|
| 96 | |
---|
| 97 | NPARS = 4 // scale and background are in all models |
---|
| 98 | problem { |
---|
[f78a2a1] | 99 | pd_par = {5, 4, x, x} // parameters *radius* and *length* vary |
---|
| 100 | pd_length = {30, 10, 0, 0} // *length* has more, so it is first |
---|
| 101 | pd_offset = {10, 0, x, x} // *length* starts at index 10 in weights |
---|
| 102 | pd_stride = {1, 30, 300, 300} // cumulative product of pd length |
---|
| 103 | pd_isvol = {1, 1, x, x} // true if weight is a volume weight |
---|
[f9245d4] | 104 | par_offset = {2, 3, 303, 313} // parameter offsets |
---|
| 105 | par_coord = {0, 3, 2, 1} // bitmap of parameter dependencies |
---|
| 106 | fast_coord_count = 2 // two parameters vary with *length* distribution |
---|
| 107 | fast_coord_index = {5, 3, x, x, x, x} |
---|
| 108 | } |
---|
| 109 | |
---|
| 110 | weight = { l0, .., l29, r0, .., r9} |
---|
| 111 | pars = { scl, bkg, l0, ..., l29, r0, r1, ..., r9, |
---|
| 112 | s[l0,r0], ... s[l0,r9], s[l1,r0], ... s[l29,r9] , s2} |
---|
| 113 | |
---|
| 114 | nq = 130 |
---|
| 115 | q = { q0, q1, ..., q130, x, x } # pad to 8 element boundary |
---|
| 116 | result = {r1, ..., r130, norm, vol, vol_norm, x, x, x, x, x, x, x} |
---|
| 117 | |
---|
[44f39a2] | 118 | |
---|
[2e44ac7] | 119 | The polydisperse parameters are stored in as an array of parameter |
---|
| 120 | indices, one for each polydisperse parameter, stored in pd_par[n]. |
---|
| 121 | Non-polydisperse parameters do not appear in this array. Each polydisperse |
---|
[44f39a2] | 122 | parameter has a weight vector whose length is stored in pd_length[n], |
---|
[f9245d4] | 123 | The weights are stored in a contiguous vector of weights for all |
---|
| 124 | parameters, with the starting position for the each parameter stored |
---|
| 125 | in pd_offset[n]. The values corresponding to the weights are stored |
---|
| 126 | together in a separate weights[] vector, with offset stored in |
---|
| 127 | par_offset[pd_par[n]]. Polydisperse parameters should be stored in |
---|
| 128 | decreasing order of length for highest efficiency. |
---|
[2e44ac7] | 129 | |
---|
| 130 | We limit the number of polydisperse dimensions to MAX_PD (currently 4). |
---|
| 131 | This cuts the size of the structure in half compared to allowing a |
---|
| 132 | separate polydispersity for each parameter. This will help a little |
---|
| 133 | bit for models with large numbers of parameters, such as the onion model. |
---|
| 134 | |
---|
| 135 | Parameters may be coordinated. That is, we may have the value of one |
---|
| 136 | parameter depend on a set of other parameters, some of which may be |
---|
| 137 | polydisperse. For example, if sld is inversely proportional to the |
---|
| 138 | volume of a cylinder, and the length and radius are independently |
---|
| 139 | polydisperse, then for each combination of length and radius we need a |
---|
| 140 | separate value for the sld. The caller must provide a coordination table |
---|
| 141 | for each parameter containing the value for each parameter given the |
---|
[44f39a2] | 142 | value of the polydisperse parameters v1, v2, etc. The tables for each |
---|
[2e44ac7] | 143 | parameter are arranged contiguously in a vector, with offset[k] giving the |
---|
| 144 | starting location of parameter k in the vector. Each parameter defines |
---|
| 145 | coord[k] as a bit mask indicating which polydispersity parameters the |
---|
[a10da8b] | 146 | parameter depends upon. Usually this is zero, indicating that the parameter |
---|
[2e44ac7] | 147 | is independent, but for the cylinder example given, the bits for the |
---|
| 148 | radius and length polydispersity parameters would both be set, the result |
---|
| 149 | being a (#radius x #length) table, or maybe a (#length x #radius) table |
---|
| 150 | if length comes first in the polydispersity table. |
---|
| 151 | |
---|
| 152 | NB: If we can guarantee that a compiler and OpenCL driver are available, |
---|
| 153 | we could instead create the coordination function on the fly for each |
---|
| 154 | parameter, saving memory and transfer time, but requiring a C compiler |
---|
| 155 | as part of the environment. |
---|
| 156 | |
---|
| 157 | In ordering the polydisperse parameters by decreasing length we can |
---|
| 158 | iterate over the longest dispersion weight vector first. All parameters |
---|
| 159 | coordinated with this weight vector (the 'fast' parameters), can be |
---|
| 160 | updated with a simple increment to the next position in the parameter |
---|
| 161 | value table. The indices of these parameters is stored in fast_coord_index[], |
---|
| 162 | with fast_coord_count being the number of fast parameters. A total |
---|
| 163 | of NPARS slots is allocated to allow for the case that all parameters |
---|
| 164 | are coordinated with the fast index, though this will likely be mostly |
---|
| 165 | empty. When the fast increment count reaches the end of the weight |
---|
| 166 | vector, then the index of the second polydisperse parameter must be |
---|
| 167 | incremented, and all of its coordinated parameters updated. Because this |
---|
| 168 | operation is not in the inner loop, a slower algorithm can be used. |
---|
| 169 | |
---|
[3044216] | 170 | If there is no polydispersity we pretend that it is polydisperisty with one |
---|
[f9245d4] | 171 | parameter, pd_start=0 and pd_stop=1. We may or may not short circuit the |
---|
| 172 | calculation in this case, depending on how much time it saves. |
---|
[3044216] | 173 | |
---|
[2e44ac7] | 174 | The problem details structure can be allocated and sent in as an integer |
---|
| 175 | array using the read-only flag. This allows us to copy it once per fit |
---|
| 176 | along with the weights vector, since features such as the number of |
---|
| 177 | polydisperity elements per pd parameter or the coordinated won't change |
---|
| 178 | between function evaluations. A new parameter vector is sent for |
---|
| 179 | each I(q) evaluation. |
---|
| 180 | |
---|
| 181 | To protect against expensive evaluations taking all the GPU resource |
---|
| 182 | on large fits, the entire polydispersity will not be computed at once. |
---|
| 183 | Instead, a start and stop location will be sent, indicating where in the |
---|
| 184 | polydispersity loop the calculation should start and where it should |
---|
| 185 | stop. We can do this for arbitrary start/stop points since we have |
---|
| 186 | unwound the nested loop. Instead, we use the same technique as array |
---|
| 187 | index translation, using div and mod to figure out the i,j,k,... |
---|
| 188 | indices in the virtual nested loop. |
---|
| 189 | |
---|
| 190 | The results array will be initialized to zero for polydispersity loop |
---|
| 191 | entry zero, and preserved between calls to [start, stop] so that the |
---|
| 192 | results accumulate by the time the loop has completed. Background and |
---|
| 193 | scale will be applied when the loop reaches the end. This does require |
---|
| 194 | that the results array be allocated read-write, which is less efficient |
---|
| 195 | for the GPU, but it makes the calling sequence much more manageable. |
---|
[3044216] | 196 | |
---|
[0cbcec4] | 197 | Scale and background cannot be coordinated with other polydisperse parameters |
---|
| 198 | |
---|
[9c490bc] | 199 | Cutoff paramater is basically used to restrict the region where integration |
---|
| 200 | is peformed i.e. polydispersity hypercude is limitted spheres. |
---|
| 201 | |
---|
[2e44ac7] | 202 | */ |
---|
| 203 | |
---|
| 204 | #define MAX_PD 4 // MAX_PD is the max number of polydisperse parameters |
---|
| 205 | #define PD_2N 16 // PD_2N is the size of the coordination step table |
---|
| 206 | |
---|
| 207 | typedef struct { |
---|
| 208 | int pd_par[MAX_PD]; // index of the nth polydispersity variable |
---|
| 209 | int pd_length[MAX_PD]; // length of the nth polydispersity weight vector |
---|
| 210 | int pd_offset[MAX_PD]; // offset of pd weights in the par & weight vector |
---|
| 211 | int pd_stride[MAX_PD]; // stride to move to the next index at this level |
---|
[f78a2a1] | 212 | int pd_isvol[MAX_PD]; // True if parameter is a volume weighting parameter |
---|
[2e44ac7] | 213 | int par_offset[NPARS]; // offset of par values in the par & weight vector |
---|
| 214 | int par_coord[NPARS]; // polydispersity coordination bitvector |
---|
| 215 | int fast_coord_count; // number of parameters coordinated with pd 1 |
---|
| 216 | int fast_coord_index[NPARS]; // index of the fast coordination parameters |
---|
| 217 | } ProblemDetails; |
---|
| 218 | |
---|
| 219 | typedef struct { |
---|
| 220 | PARAMETER_DECL; |
---|
| 221 | } ParameterBlock; |
---|
| 222 | |
---|
[9c490bc] | 223 | #define KERNEL_NAME test_Iq |
---|
| 224 | #define FULL_KERNEL_NAME test_Iq |
---|
| 225 | #define IQ_FUNC Iq |
---|
| 226 | |
---|
| 227 | #define IQ_PARAMETERS ignored |
---|
| 228 | #define IQ_FIXED_PARAMETER_DECLARATIONS const double scale, \ |
---|
| 229 | const double background, \ |
---|
| 230 | const double ignored |
---|
| 231 | #define IQ_PARAMETER_DECLARATIONS double ignored |
---|
| 232 | #define IQXY_KERNEL_NAME bessel_Iqxy |
---|
| 233 | #define IQXY_PARAMETERS ignored |
---|
| 234 | #define IQXY_FIXED_PARAMETER_DECLARATIONS const double scale, \ |
---|
| 235 | const double background, \ |
---|
| 236 | const double ignored |
---|
| 237 | #define IQXY_PARAMETER_DECLARATIONS double ignored |
---|
| 238 | |
---|
| 239 | |
---|
[f9245d4] | 240 | void FULL_KERNEL_NAME( |
---|
[2e44ac7] | 241 | int nq, // number of q values |
---|
| 242 | global const ProblemDetails *problem, |
---|
| 243 | global const double *weights, |
---|
| 244 | global const double *pars, |
---|
| 245 | global const double *q, // nq q values, with padding to boundary |
---|
| 246 | global double *result, // nq return values, again with padding |
---|
| 247 | const double cutoff, // cutoff in the polydispersity weight product |
---|
| 248 | const int pd_start, // where we are in the polydispersity loop |
---|
| 249 | const int pd_stop, // where we are stopping in the polydispersity loop |
---|
| 250 | ) |
---|
| 251 | { |
---|
[3044216] | 252 | |
---|
[10ddb64] | 253 | // Storage for the current parameter values. These will be updated as we |
---|
| 254 | // walk the polydispersity cube. |
---|
[f9245d4] | 255 | local ParameterBlock local_pars; // current parameter values |
---|
[2e44ac7] | 256 | const double *parvec = &local_pars; // Alias named parameters with a vector |
---|
| 257 | |
---|
[f9245d4] | 258 | local int offset[NPARS-2]; |
---|
| 259 | |
---|
[3044216] | 260 | #if defined(USE_SHORTCUT_OPTIMIZATION) |
---|
| 261 | if (pd_length[0] == 1) { |
---|
| 262 | // Shouldn't need to copy!! |
---|
| 263 | for (int k=0; k < NPARS; k++) { |
---|
[f9245d4] | 264 | parvec[k] = pars[k+2]; // skip scale and background |
---|
[3044216] | 265 | } |
---|
| 266 | |
---|
| 267 | #ifdef USE_OPENMP |
---|
| 268 | #pragma omp parallel for |
---|
| 269 | #endif |
---|
| 270 | for (int i=0; i < nq; i++) { |
---|
| 271 | { |
---|
| 272 | const double scattering = IQ_FUNC(IQ_PARS, IQ_PARAMETERS); |
---|
[f9245d4] | 273 | result[i] += pars[0]*scattering + pars[1]; |
---|
[3044216] | 274 | } |
---|
| 275 | return; |
---|
| 276 | } |
---|
| 277 | #endif |
---|
| 278 | |
---|
| 279 | |
---|
[10ddb64] | 280 | // Since we are no longer looping over the entire polydispersity hypercube |
---|
| 281 | // for each q, we need to track the normalization values for each q in a |
---|
| 282 | // separate work vector. |
---|
[3044216] | 283 | double norm; // contains sum over weights |
---|
| 284 | double vol; // contains sum over volume |
---|
| 285 | double norm_vol; // contains weights over volume |
---|
[10ddb64] | 286 | |
---|
[2e44ac7] | 287 | // Initialize the results to zero |
---|
| 288 | if (pd_start == 0) { |
---|
[3044216] | 289 | norm_vol = 0.0; |
---|
| 290 | norm = 0.0; |
---|
| 291 | vol = 0.0; |
---|
| 292 | |
---|
[2e44ac7] | 293 | #ifdef USE_OPENMP |
---|
| 294 | #pragma omp parallel for |
---|
| 295 | #endif |
---|
[3044216] | 296 | for (int i=0; i < nq; i++) { |
---|
[2e44ac7] | 297 | result[i] = 0.0; |
---|
| 298 | } |
---|
[3044216] | 299 | } else { |
---|
| 300 | //Pulling values from previous segment |
---|
| 301 | norm = result[nq]; |
---|
| 302 | vol = result[nq+1]; |
---|
| 303 | norm_vol = results[nq+2]; |
---|
[2e44ac7] | 304 | } |
---|
| 305 | |
---|
[3044216] | 306 | // Location in the polydispersity hypercube, one index per dimension. |
---|
[a10da8b] | 307 | local int pd_index[PD_MAX]; |
---|
| 308 | |
---|
[f9245d4] | 309 | // Trigger the reset behaviour that happens at the end the fast loop |
---|
| 310 | // by setting the initial index >= weight vector length. |
---|
[a10da8b] | 311 | pd_index[0] = pd_length[0]; |
---|
[2e44ac7] | 312 | |
---|
[3044216] | 313 | // need product of weights at every Iq calc, so keep product of |
---|
| 314 | // weights from the outer loops so that weight = partial_weight * fast_weight |
---|
| 315 | double partial_weight = NAN; // product of weight w4*w3*w2 but not w1 |
---|
[f78a2a1] | 316 | double partial_volweight = NAN; |
---|
| 317 | double weight = 1.0; // set to 1 in case there are no weights |
---|
| 318 | double vol_weight = 1.0; // set to 1 in case there are no vol weights |
---|
[3044216] | 319 | |
---|
[2e44ac7] | 320 | // Loop over the weights then loop over q, accumulating values |
---|
| 321 | for (int loop_index=pd_start; loop_index < pd_stop; loop_index++) { |
---|
| 322 | // check if indices need to be updated |
---|
| 323 | if (pd_index[0] >= pd_length[0]) { |
---|
| 324 | pd_index[0] = loop_index%pd_length[0]; |
---|
| 325 | partial_weight = 1.0; |
---|
[f78a2a1] | 326 | partial_volweight = 1.0; |
---|
[3044216] | 327 | for (int k=1; k < MAX_PD; k++) { |
---|
[2e44ac7] | 328 | pd_index[k] = (loop_index%pd_length[k])/pd_stride[k]; |
---|
[f78a2a1] | 329 | const double wi = weights[pd_offset[0]+pd_index[0]]; |
---|
| 330 | partial_weight *= wi; |
---|
| 331 | if (pd_isvol[k]) partial_volweight *= wi; |
---|
[2e44ac7] | 332 | } |
---|
| 333 | weight = partial_weight * weights[pd_offset[0]+pd_index[0]] |
---|
| 334 | for (int k=0; k < NPARS; k++) { |
---|
| 335 | int coord = par_coord[k]; |
---|
[3044216] | 336 | int this_offset = par_offset[k]; |
---|
[2e44ac7] | 337 | int block_size = 1; |
---|
| 338 | for (int bit=0; bit < MAX_PD && coord != 0; bit++) { |
---|
| 339 | if (coord&1) { |
---|
| 340 | this_offset += block_size * pd_index[bit]; |
---|
| 341 | block_size *= pd_length[bit]; |
---|
| 342 | } |
---|
[a10da8b] | 343 | coord /= 2; |
---|
[2e44ac7] | 344 | } |
---|
| 345 | offset[k] = this_offset; |
---|
[3044216] | 346 | parvec[k] = pars[this_offset]; |
---|
[2e44ac7] | 347 | } |
---|
| 348 | } else { |
---|
| 349 | pd_index[0] += 1; |
---|
[f78a2a1] | 350 | const double wi = weights[pd_offset[0]+pd_index[0]]; |
---|
| 351 | weight = partial_weight*wi; |
---|
| 352 | if (pd_isvol[0]) vol_weight *= wi; |
---|
[2e44ac7] | 353 | for (int k=0; k < problem->fast_coord_count; k++) { |
---|
| 354 | parvec[ fast_coord_index[k]] |
---|
| 355 | = pars[offset[fast_coord_index[k]] + pd_index[0]]; |
---|
| 356 | } |
---|
| 357 | } |
---|
[3044216] | 358 | #ifdef INVALID |
---|
| 359 | if (INVALID(local_pars)) continue; |
---|
| 360 | #endif |
---|
[10ddb64] | 361 | if (weight > cutoff) { |
---|
[3044216] | 362 | norm += weight; |
---|
[f78a2a1] | 363 | vol += vol_weight * form_volume(VOLUME_PARAMETERS); |
---|
[3044216] | 364 | norm_vol += vol_weight; |
---|
| 365 | |
---|
[10ddb64] | 366 | #ifdef USE_OPENMP |
---|
| 367 | #pragma omp parallel for |
---|
| 368 | #endif |
---|
[3044216] | 369 | for (int i=0; i < nq; i++) { |
---|
[10ddb64] | 370 | { |
---|
[3044216] | 371 | const double scattering = IQ_FUNC(IQ_PARS, IQ_PARAMETERS); |
---|
| 372 | //const double scattering = Iq(q[i], IQ_PARAMETERS); |
---|
| 373 | result[i] += weight*scattering; |
---|
| 374 | } |
---|
[2e44ac7] | 375 | } |
---|
[3044216] | 376 | //Makes a normalization avialable for the next round |
---|
| 377 | result[nq] = norm; |
---|
| 378 | result[nq+1] = vol; |
---|
[f35f1dd] | 379 | result[nq+2] = norm_vol; |
---|
[2e44ac7] | 380 | |
---|
[3044216] | 381 | //End of the PD loop we can normalize |
---|
[2e44ac7] | 382 | if (pd_stop == pd_stride[MAX_PD-1]) {} |
---|
| 383 | #ifdef USE_OPENMP |
---|
| 384 | #pragma omp parallel for |
---|
| 385 | #endif |
---|
[3044216] | 386 | for (int i=0; i < nq; i++) { |
---|
| 387 | if (vol*norm_vol != 0.0) { |
---|
| 388 | result[i] *= norm_vol/vol; |
---|
[2e44ac7] | 389 | } |
---|
[f9245d4] | 390 | result[i] = pars[0]*result[i]/norm + pars[1]; |
---|
[2e44ac7] | 391 | } |
---|
| 392 | } |
---|
| 393 | } |
---|
[a10da8b] | 394 | } |
---|
[9c490bc] | 395 | } |
---|