kernel_iq.c @ f9245d4

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since f9245d4 was f9245d4, checked in by Paul Kienzle <pkienzle@…>, 8 years ago
update design, noting that bkg/scale are special
Property mode set to `100644`
File size: 14.2 KB

Line
1
2	/*
3	##########################################################
4	# #
5	# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
6	# !! !! #
7	# !! KEEP THIS CODE CONSISTENT WITH KERNELPY.PY !! #
8	# !! !! #
9	# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
10	# #
11	##########################################################
12	*/
13
14	/*
15	The environment needs to provide the following #defines:
16
17	USE_OPENCL is defined if running in opencl
18	KERNEL declares a function to be available externally
19	KERNEL_NAME is the name of the function being declared
20	NPARS is the number of parameters in the kernel
21	PARAMETER_DECL is the declaration of the parameters to the kernel.
22
23	Cylinder:
24
25	#define PARAMETER_DECL \
26	double length; \
27	double radius; \
28	double sld; \
29	double sld_solvent
30
31	Note: scale and background are not included
32
33	Multi-shell cylinder (10 shell max):
34
35	#define PARAMETER_DECL \
36	double num_shells; \
37	double length; \
38	double radius[10]; \
39	double sld[10]; \
40	double sld_solvent
41
42	PARAMETER_CALL(var) is the declaration of a call to the kernel.
43
44	Cylinder:
45
46	#define PARAMETER_CALL(var) \
47	var.length, \
48	var.radius, \
49	var.sld, \
50	var.sld_solvent
51
52	Multi-shell cylinder:
53	#define PARAMETER_CALL(var) \
54	var.num_shells, \
55	var.length, \
56	var.radius, \
57	var.sld, \
58	var.sld_solvent
59
60	INVALID is a test for model parameters in the correct range
61
62	Cylinder:
63
64	#define INVALID(var) 0
65
66	BarBell:
67
68	#define INVALID(var) (var.bell_radius > var.radius)
69
70	Model with complicated constraints:
71
72	inline bool constrained(p1, p2, p3) { return expression; }
73	#define INVALID(var) constrained(var.p1, var.p2, var.p3)
74
75	IQ_FUNC could be Iq or Iqxy
76	IQ_PARS could be q[i] or q[2i],q[2i+1]
77
78	Our design supports a limited number of polydispersity loops, wherein
79	we need to cycle through the values of the polydispersity, calculate
80	the I(q, p) for each combination of parameters, and perform a normalized
81	weighted sum across all the weights. Parameters may be passed to the
82	underlying calculation engine as scalars or vectors, but the polydispersity
83	calculator treats the parameter set as one long vector.
84
85	Let's assume we have 6 parameters in the model, with two polydisperse::
86
87	0: scale {scl = constant}
88	1: background {bkg = constant}
89	5: length {l = vector of 30pts}
90	4: radius {r = vector of 10pts}
91	3: sld {s = constant/(radius*2length)}
92	2: sld_solvent {s2 = constant}
93
94	This generates the following call to the kernel (where x stands for an
95	arbitrary value that is not used by the kernel evaluator):
96
97	NPARS = 4 // scale and background are in all models
98	problem {
99	pd_par = {5, 4, x, x} // parameters radius and length vary
100	pd_length = {30,10,0,0} // length has more, so it is first
101	pd_offset = {10,0,x,x} // length starts at index 10 in weights
102	pd_stride = {1,30,300,300} // cumulative product of pd length
103	par_offset = {2, 3, 303, 313} // parameter offsets
104	par_coord = {0, 3, 2, 1} // bitmap of parameter dependencies
105	fast_coord_count = 2 // two parameters vary with length distribution
106	fast_coord_index = {5, 3, x, x, x, x}
107	}
108
109	weight = { l0, .., l29, r0, .., r9}
110	pars = { scl, bkg, l0, ..., l29, r0, r1, ..., r9,
111	s[l0,r0], ... s[l0,r9], s[l1,r0], ... s[l29,r9] , s2}
112
113	nq = 130
114	q = { q0, q1, ..., q130, x, x } # pad to 8 element boundary
115	result = {r1, ..., r130, norm, vol, vol_norm, x, x, x, x, x, x, x}
116
117
118	The polydisperse parameters are stored in as an array of parameter
119	indices, one for each polydisperse parameter, stored in pd_par[n].
120	Non-polydisperse parameters do not appear in this array. Each polydisperse
121	parameter has a weight vector whose length is stored in pd_length[n],
122	The weights are stored in a contiguous vector of weights for all
123	parameters, with the starting position for the each parameter stored
124	in pd_offset[n]. The values corresponding to the weights are stored
125	together in a separate weights[] vector, with offset stored in
126	par_offset[pd_par[n]]. Polydisperse parameters should be stored in
127	decreasing order of length for highest efficiency.
128
129	We limit the number of polydisperse dimensions to MAX_PD (currently 4).
130	This cuts the size of the structure in half compared to allowing a
131	separate polydispersity for each parameter. This will help a little
132	bit for models with large numbers of parameters, such as the onion model.
133
134	Parameters may be coordinated. That is, we may have the value of one
135	parameter depend on a set of other parameters, some of which may be
136	polydisperse. For example, if sld is inversely proportional to the
137	volume of a cylinder, and the length and radius are independently
138	polydisperse, then for each combination of length and radius we need a
139	separate value for the sld. The caller must provide a coordination table
140	for each parameter containing the value for each parameter given the
141	value of the polydisperse parameters v1, v2, etc. The tables for each
142	parameter are arranged contiguously in a vector, with offset[k] giving the
143	starting location of parameter k in the vector. Each parameter defines
144	coord[k] as a bit mask indicating which polydispersity parameters the
145	parameter depends upon. Usually this is zero, indicating that the parameter
146	is independent, but for the cylinder example given, the bits for the
147	radius and length polydispersity parameters would both be set, the result
148	being a (#radius x #length) table, or maybe a (#length x #radius) table
149	if length comes first in the polydispersity table.
150
151	NB: If we can guarantee that a compiler and OpenCL driver are available,
152	we could instead create the coordination function on the fly for each
153	parameter, saving memory and transfer time, but requiring a C compiler
154	as part of the environment.
155
156	In ordering the polydisperse parameters by decreasing length we can
157	iterate over the longest dispersion weight vector first. All parameters
158	coordinated with this weight vector (the 'fast' parameters), can be
159	updated with a simple increment to the next position in the parameter
160	value table. The indices of these parameters is stored in fast_coord_index[],
161	with fast_coord_count being the number of fast parameters. A total
162	of NPARS slots is allocated to allow for the case that all parameters
163	are coordinated with the fast index, though this will likely be mostly
164	empty. When the fast increment count reaches the end of the weight
165	vector, then the index of the second polydisperse parameter must be
166	incremented, and all of its coordinated parameters updated. Because this
167	operation is not in the inner loop, a slower algorithm can be used.
168
169	If there is no polydispersity we pretend that it is polydisperisty with one
170	parameter, pd_start=0 and pd_stop=1. We may or may not short circuit the
171	calculation in this case, depending on how much time it saves.
172
173	The problem details structure can be allocated and sent in as an integer
174	array using the read-only flag. This allows us to copy it once per fit
175	along with the weights vector, since features such as the number of
176	polydisperity elements per pd parameter or the coordinated won't change
177	between function evaluations. A new parameter vector is sent for
178	each I(q) evaluation.
179
180	To protect against expensive evaluations taking all the GPU resource
181	on large fits, the entire polydispersity will not be computed at once.
182	Instead, a start and stop location will be sent, indicating where in the
183	polydispersity loop the calculation should start and where it should
184	stop. We can do this for arbitrary start/stop points since we have
185	unwound the nested loop. Instead, we use the same technique as array
186	index translation, using div and mod to figure out the i,j,k,...
187	indices in the virtual nested loop.
188
189	The results array will be initialized to zero for polydispersity loop
190	entry zero, and preserved between calls to [start, stop] so that the
191	results accumulate by the time the loop has completed. Background and
192	scale will be applied when the loop reaches the end. This does require
193	that the results array be allocated read-write, which is less efficient
194	for the GPU, but it makes the calling sequence much more manageable.
195
196	Scale and background cannot be coordinated with other polydisperse parameters
197
198	TODO: cutoff
199	*/
200
201	#define MAX_PD 4 // MAX_PD is the max number of polydisperse parameters
202	#define PD_2N 16 // PD_2N is the size of the coordination step table
203
204	typedef struct {
205	int pd_par[MAX_PD]; // index of the nth polydispersity variable
206	int pd_length[MAX_PD]; // length of the nth polydispersity weight vector
207	int pd_offset[MAX_PD]; // offset of pd weights in the par & weight vector
208	int pd_stride[MAX_PD]; // stride to move to the next index at this level
209	int par_offset[NPARS]; // offset of par values in the par & weight vector
210	int par_coord[NPARS]; // polydispersity coordination bitvector
211	int fast_coord_count; // number of parameters coordinated with pd 1
212	int fast_coord_index[NPARS]; // index of the fast coordination parameters
213	} ProblemDetails;
214
215	typedef struct {
216	PARAMETER_DECL;
217	} ParameterBlock;
218
219	#define FULL_KERNEL_NAME KERNEL_NAME ## _ ## IQ_FUNC
220	KERNEL
221	void FULL_KERNEL_NAME(
222	int nq, // number of q values
223	global const ProblemDetails *problem,
224	global const double *weights,
225	global const double *pars,
226	global const double *q, // nq q values, with padding to boundary
227	global double *result, // nq return values, again with padding
228	const double cutoff, // cutoff in the polydispersity weight product
229	const int pd_start, // where we are in the polydispersity loop
230	const int pd_stop, // where we are stopping in the polydispersity loop
231	)
232	{
233
234	// Storage for the current parameter values. These will be updated as we
235	// walk the polydispersity cube.
236	local ParameterBlock local_pars; // current parameter values
237	const double *parvec = &local_pars; // Alias named parameters with a vector
238
239	local int offset[NPARS-2];
240
241	#if defined(USE_SHORTCUT_OPTIMIZATION)
242	if (pd_length[0] == 1) {
243	// Shouldn't need to copy!!
244	for (int k=0; k < NPARS; k++) {
245	parvec[k] = pars[k+2]; // skip scale and background
246	}
247
248	#ifdef USE_OPENMP
249	#pragma omp parallel for
250	#endif
251	for (int i=0; i < nq; i++) {
252	{
253	const double scattering = IQ_FUNC(IQ_PARS, IQ_PARAMETERS);
254	result[i] += pars[0]*scattering + pars[1];
255	}
256	return;
257	}
258	#endif
259
260
261	// Since we are no longer looping over the entire polydispersity hypercube
262	// for each q, we need to track the normalization values for each q in a
263	// separate work vector.
264	double norm; // contains sum over weights
265	double vol; // contains sum over volume
266	double norm_vol; // contains weights over volume
267
268	// Initialize the results to zero
269	if (pd_start == 0) {
270	norm_vol = 0.0;
271	norm = 0.0;
272	vol = 0.0;
273
274	#ifdef USE_OPENMP
275	#pragma omp parallel for
276	#endif
277	for (int i=0; i < nq; i++) {
278	result[i] = 0.0;
279	}
280	} else {
281	//Pulling values from previous segment
282	norm = result[nq];
283	vol = result[nq+1];
284	norm_vol = results[nq+2];
285	}
286
287	// Location in the polydispersity hypercube, one index per dimension.
288	local int pd_index[PD_MAX];
289
290	// Trigger the reset behaviour that happens at the end the fast loop
291	// by setting the initial index >= weight vector length.
292	pd_index[0] = pd_length[0];
293
294	// need product of weights at every Iq calc, so keep product of
295	// weights from the outer loops so that weight = partial_weight * fast_weight
296	double partial_weight = NAN; // product of weight w4w3w2 but not w1
297
298	// Loop over the weights then loop over q, accumulating values
299	for (int loop_index=pd_start; loop_index < pd_stop; loop_index++) {
300	// check if indices need to be updated
301	if (pd_index[0] >= pd_length[0]) {
302	pd_index[0] = loop_index%pd_length[0];
303	partial_weight = 1.0;
304	for (int k=1; k < MAX_PD; k++) {
305	pd_index[k] = (loop_index%pd_length[k])/pd_stride[k];
306	partial_weight *= weights[pd_offset[k]+pd_index[k]];
307	}
308	weight = partial_weight * weights[pd_offset[0]+pd_index[0]]
309	for (int k=0; k < NPARS; k++) {
310	int coord = par_coord[k];
311	int this_offset = par_offset[k];
312	int block_size = 1;
313	for (int bit=0; bit < MAX_PD && coord != 0; bit++) {
314	if (coord&1) {
315	this_offset += block_size * pd_index[bit];
316	block_size *= pd_length[bit];
317	}
318	coord /= 2;
319	}
320	offset[k] = this_offset;
321	parvec[k] = pars[this_offset];
322	}
323	} else {
324	pd_index[0] += 1;
325	weight = partial_weight*weights[pd_offset[0]+pd_index[0]];
326	for (int k=0; k < problem->fast_coord_count; k++) {
327	parvec[ fast_coord_index[k]]
328	= pars[offset[fast_coord_index[k]] + pd_index[0]];
329	}
330	}
331	#ifdef INVALID
332	if (INVALID(local_pars)) continue;
333	#endif
334	if (weight > cutoff) {
335	const double vol_weight = VOLUME_WEIGHT_PRODUCT;
336	const double weighted_vol = vol_weight*form_volume(VOLUME_PARAMETERS);
337	norm += weight;
338	vol += weighted_vol;
339	norm_vol += vol_weight;
340
341	#ifdef USE_OPENMP
342	#pragma omp parallel for
343	#endif
344	for (int i=0; i < nq; i++) {
345	{
346	const double scattering = IQ_FUNC(IQ_PARS, IQ_PARAMETERS);
347	//const double scattering = Iq(q[i], IQ_PARAMETERS);
348	result[i] += weight*scattering;
349	}
350	}
351	//Makes a normalization avialable for the next round
352	result[nq] = norm;
353	result[nq+1] = vol;
354	results[nq+2] = norm_vol;
355
356	//End of the PD loop we can normalize
357	if (pd_stop == pd_stride[MAX_PD-1]) {}
358	#ifdef USE_OPENMP
359	#pragma omp parallel for
360	#endif
361	for (int i=0; i < nq; i++) {
362	if (vol*norm_vol != 0.0) {
363	result[i] *= norm_vol/vol;
364	}
365	result[i] = pars[0]*result[i]/norm + pars[1];
366	}
367	}
368	}
369	}

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernel_iq.c @ f9245d4

Download in other formats: