kernelcl.py @ af1d68c

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since af1d68c was af1d68c, checked in by Paul Kienzle <pkienzle@…>, 9 years ago
remove obsolete code
Property mode set to `100644`
File size: 13.3 KB

Line
1	"""
2	GPU support through OpenCL
3
4	There should be a single GPU environment running on the system. This
5	environment is constructed on the first call to :func:`env`, and the
6	same environment is returned on each call.
7
8	After retrieving the environment, the next step is to create the kernel.
9	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10	returns the type of data used by the kernel.
11
12	Next a :class:`GpuData` object should be created with the correct kind
13	of data. This data object can be used by multiple kernels, for example,
14	if the target model is a weighted sum of multiple kernels. The data
15	should include any extra evaluation points required to compute the proper
16	data smearing. This need not match the square grid for 2D data if there
17	is an index saying which q points are active.
18
19	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20	This kernel is used during fitting, receiving new sets of parameters and
21	evaluating them. The output value is stored in an output buffer on the
22	devices, where it can be combined with other structure factors and form
23	factors and have instrumental resolution effects applied.
24	"""
25	import os
26	import warnings
27
28	import numpy as np
29
30	try:
31	import pyopencl as cl
32	# Ask OpenCL for the default context so that we know that one exists
33	cl.create_some_context(interactive=False)
34	except Exception, exc:
35	warnings.warn(str(exc))
36	raise RuntimeError("OpenCL not available")
37
38	from pyopencl import mem_flags as mf
39
40	from . import generate
41	from .kernelpy import PyModel
42
43	F64_DEFS = """\
44	#ifdef cl_khr_fp64
45	# pragma OPENCL EXTENSION cl_khr_fp64: enable
46	#endif
47	"""
48
49	# The max loops number is limited by the amount of local memory available
50	# on the device. You don't want to make this value too big because it will
51	# waste resources, nor too small because it may interfere with users trying
52	# to do their polydispersity calculations. A value of 1024 should be much
53	# larger than necessary given that cost grows as npts^k where k is the number
54	# of polydisperse parameters.
55	MAX_LOOPS = 2048
56
57
58	ENV = None
59	def environment():
60	"""
61	Returns a singleton :class:`GpuEnvironment`.
62
63	This provides an OpenCL context and one queue per device.
64	"""
65	global ENV
66	if ENV is None:
67	ENV = GpuEnvironment()
68	return ENV
69
70	def has_double(device):
71	"""
72	Return true if device supports double precision.
73	"""
74	return "cl_khr_fp64" in device.extensions
75
76	def get_warp(kernel, queue):
77	"""
78	Return the size of an execution batch for kernel running on queue.
79	"""
80	return kernel.get_work_group_info(
81	cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
82	queue.device)
83
84	def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
85	"""
86	Stretch an input vector to the correct boundary.
87
88	Performance on the kernels can drop by a factor of two or more if the
89	number of values to compute does not fall on a nice power of two
90	boundary. The trailing additional vector elements are given a
91	value of extra, and so f(extra) will be computed for each of
92	them. The returned array will thus be a subset of the computed array.
93
94	boundary should be a power of 2 which is at least 32 for good
95	performance on current platforms (as of Jan 2015). It should
96	probably be the max of get_warp(kernel,queue) and
97	device.min_data_type_align_size//4.
98	"""
99	remainder = vector.size % boundary
100	if remainder != 0:
101	size = vector.size + (boundary - remainder)
102	vector = np.hstack((vector, [extra] * (size - vector.size)))
103	return np.ascontiguousarray(vector, dtype=dtype)
104
105
106	def compile_model(context, source, dtype):
107	"""
108	Build a model to run on the gpu.
109
110	Returns the compiled program and its type. The returned type will
111	be float32 even if the desired type is float64 if any of the
112	devices in the context do not support the cl_khr_fp64 extension.
113	"""
114	dtype = np.dtype(dtype)
115	if dtype == generate.F64 and not all(has_double(d) for d in context.devices):
116	raise RuntimeError("Double precision not supported for devices")
117
118	header = F64_DEFS if dtype == generate.F64 else ""
119	if dtype == generate.F32:
120	source = generate.use_single(source)
121	# Note: USE_SINCOS makes the intel cpu slower under opencl
122	if context.devices[0].type == cl.device_type.GPU:
123	header += "#define USE_SINCOS\n"
124	program = cl.Program(context, header + source).build()
125	return program
126
127
128	def make_result(self, size):
129	self.res = np.empty(size, dtype=self.dtype)
130	self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
131	return self.res, self.res_b
132
133
134	# for now, this returns one device in the context
135	# TODO: create a context that contains all devices on all platforms
136	class GpuEnvironment(object):
137	"""
138	GPU context, with possibly many devices, and one queue per device.
139	"""
140	def __init__(self):
141	# find gpu context
142	#self.context = cl.create_some_context()
143
144	self.context = None
145	if 'PYOPENCL_CTX' in os.environ:
146	self._create_some_context()
147
148	if not self.context:
149	self.context = _get_default_context()
150
151	# Byte boundary for data alignment
152	#self.data_boundary = max(d.min_data_type_align_size
153	# for d in self.context.devices)
154	self.queues = [cl.CommandQueue(self.context, d)
155	for d in self.context.devices]
156	self.has_double = all(has_double(d) for d in self.context.devices)
157	self.compiled = {}
158
159	def _create_some_context(self):
160	try:
161	self.context = cl.create_some_context(interactive=False)
162	except Exception, exc:
163	warnings.warn(str(exc))
164	warnings.warn("pyopencl.create_some_context() failed")
165	warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
166
167	def compile_program(self, name, source, dtype):
168	if name not in self.compiled:
169	#print "compiling",name
170	self.compiled[name] = compile_model(self.context, source, dtype)
171	return self.compiled[name]
172
173	def release_program(self, name):
174	if name in self.compiled:
175	self.compiled[name].release()
176	del self.compiled[name]
177
178	def _get_default_context():
179	default = None
180	for platform in cl.get_platforms():
181	for device in platform.get_devices():
182	if device.type == cl.device_type.GPU:
183	return cl.Context([device])
184	if default is None:
185	default = device
186
187	if not default:
188	raise RuntimeError("OpenCL device not found")
189
190	return cl.Context([default])
191
192
193	class GpuModel(object):
194	"""
195	GPU wrapper for a single model.
196
197	source and info are the model source and interface as returned
198	from :func:`gen.make`.
199
200	dtype is the desired model precision. Any numpy dtype for single
201	or double precision floats will do, such as 'f', 'float32' or 'single'
202	for single and 'd', 'float64' or 'double' for double. Double precision
203	is an optional extension which may not be available on all devices.
204	"""
205	def __init__(self, source, info, dtype=generate.F32):
206	self.info = info
207	self.source = source
208	self.dtype = np.dtype(dtype)
209	self.program = None # delay program creation
210
211	def __getstate__(self):
212	state = self.__dict__.copy()
213	state['program'] = None
214	return state
215
216	def __setstate__(self, state):
217	self.__dict__ = state.copy()
218
219	def __call__(self, q_input):
220	if self.dtype != q_input.dtype:
221	raise TypeError("data is %s kernel is %s" % (q_input.dtype, self.dtype))
222	if self.program is None:
223	compiler = environment().compile_program
224	self.program = compiler(self.info['name'], self.source, self.dtype)
225	kernel_name = generate.kernel_name(self.info, q_input.is_2D)
226	kernel = getattr(self.program, kernel_name)
227	return GpuKernel(kernel, self.info, q_input)
228
229	def release(self):
230	if self.program is not None:
231	environment().release_program(self.info['name'])
232	self.program = None
233
234	def make_input(self, q_vectors):
235	"""
236	Make q input vectors available to the model.
237
238	Note that each model needs its own q vector even if the case of
239	mixture models because some models may be OpenCL, some may be
240	ctypes and some may be pure python.
241	"""
242	return GpuInput(q_vectors, dtype=self.dtype)
243
244	# TODO: check that we don't need a destructor for buffers which go out of scope
245	class GpuInput(object):
246	"""
247	Make q data available to the gpu.
248
249	q_vectors is a list of q vectors, which will be [q] for 1-D data,
250	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
251	to get the best performance on OpenCL, which may involve shifting and
252	stretching the array to better match the memory architecture. Additional
253	points will be evaluated with q=1e-3.
254
255	dtype is the data type for the q vectors. The data type should be
256	set to match that of the kernel, which is an attribute of
257	:class:`GpuProgram`. Note that not all kernels support double
258	precision, so even if the program was created for double precision,
259	the GpuProgram.dtype may be single precision.
260
261	Call :meth:`release` when complete. Even if not called directly, the
262	buffer will be released when the data object is freed.
263	"""
264	def __init__(self, q_vectors, dtype=generate.F32):
265	env = environment()
266	self.nq = q_vectors[0].size
267	self.dtype = np.dtype(dtype)
268	self.is_2D = (len(q_vectors) == 2)
269	# TODO: stretch input based on get_warp()
270	# not doing it now since warp depends on kernel, which is not known
271	# at this point, so instead using 32, which is good on the set of
272	# architectures tested so far.
273	self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
274	self.q_buffers = [
275	cl.Buffer(env.context, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=q)
276	for q in self.q_vectors
277	]
278	self.global_size = [self.q_vectors[0].size]
279
280	def release(self):
281	for b in self.q_buffers:
282	b.release()
283	self.q_buffers = []
284
285	class GpuKernel(object):
286	"""
287	Callable SAS kernel.
288
289	kernel is the GpuKernel object to call.
290
291	info is the module information
292
293	q_input is the DllInput q vectors at which the kernel should be
294	evaluated.
295
296	The resulting call method takes the pars, a list of values for
297	the fixed parameters to the kernel, and pd_pars, a list of (value,weight)
298	vectors for the polydisperse parameters. cutoff determines the
299	integration limits: any points with combined weight less than cutoff
300	will not be calculated.
301
302	Call :meth:`release` when done with the kernel instance.
303	"""
304	def __init__(self, kernel, info, q_input):
305	self.q_input = q_input
306	self.kernel = kernel
307	self.info = info
308	self.res = np.empty(q_input.nq, q_input.dtype)
309	dim = '2d' if q_input.is_2D else '1d'
310	self.fixed_pars = info['partype']['fixed-' + dim]
311	self.pd_pars = info['partype']['pd-' + dim]
312
313	# Inputs and outputs for each kernel call
314	# Note: res may be shorter than res_b if global_size != nq
315	env = environment()
316	self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
317	2 * MAX_LOOPS * q_input.dtype.itemsize)
318	for _ in env.queues]
319	self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
320	q_input.global_size[0] * q_input.dtype.itemsize)
321	for _ in env.queues]
322
323
324	def __call__(self, fixed_pars, pd_pars, cutoff=1e-5):
325	real = np.float32 if self.q_input.dtype == generate.F32 else np.float64
326
327	device_num = 0
328	queuei = environment().queues[device_num]
329	res_bi = self.res_b[device_num]
330	nq = np.uint32(self.q_input.nq)
331	if pd_pars:
332	cutoff = real(cutoff)
333	loops_N = [np.uint32(len(p[0])) for p in pd_pars]
334	loops = np.hstack(pd_pars) \
335	if pd_pars else np.empty(0, dtype=self.q_input.dtype)
336	loops = np.ascontiguousarray(loops.T, self.q_input.dtype).flatten()
337	#print "loops",Nloops, loops
338
339	#import sys; print >>sys.stderr,"opencl eval",pars
340	#print "opencl eval",pars
341	if len(loops) > 2 * MAX_LOOPS:
342	raise ValueError("too many polydispersity points")
343
344	loops_bi = self.loops_b[device_num]
345	cl.enqueue_copy(queuei, loops_bi, loops)
346	loops_l = cl.LocalMemory(len(loops.data))
347	#ctx = environment().context
348	#loops_bi = cl.Buffer(ctx, mf.READ_ONLY\|mf.COPY_HOST_PTR, hostbuf=loops)
349	dispersed = [loops_bi, loops_l, cutoff] + loops_N
350	else:
351	dispersed = []
352	fixed = [real(p) for p in fixed_pars]
353	args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed
354	self.kernel(queuei, self.q_input.global_size, None, *args)
355	cl.enqueue_copy(queuei, self.res, res_bi)
356
357	return self.res
358
359	def release(self):
360	for b in self.loops_b:
361	b.release()
362	self.loops_b = []
363	for b in self.res_b:
364	b.release()
365	self.res_b = []
366
367	def __del__(self):
368	self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernelcl.py @ af1d68c

Download in other formats: