source: sasmodels/sasmodels/kernelcl.py @ 0763009

Last change on this file since 0763009 was 0763009, checked in by Paul Kienzle <pkienzle@…>, 8 years ago

minor code cleanup

1"""
2GPU support through OpenCL
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
24"""
import os
import warnings

import numpy as np

try:
    import pyopencl as cl
    # Ask OpenCL for the default context so that we know that one exists
    cl.create_some_context(interactive=False)
except Exception as exc:
    warnings.warn(str(exc))
    raise RuntimeError("OpenCL not available")

from pyopencl import mem_flags as mf

from . import generate

F64_DEFS = """\
#ifdef cl_khr_fp64
#  pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
"""

# The maximum number of loop values is limited by the amount of local memory
# available on the device.  You don't want to make this value too big because
# it will waste resources, nor too small because it may interfere with users
# trying to do their polydispersity calculations.  A value of 2048 should be
# much larger than necessary given that cost grows as npts^k where k is the
# number of polydisperse parameters.
MAX_LOOPS = 2048


ENV = None
def environment():
    """
    Returns a singleton :class:`GpuEnvironment`.

    This provides an OpenCL context and one queue per device.
    """
    global ENV
    if ENV is None:
        ENV = GpuEnvironment()
    return ENV

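# For example (illustrative), the environment can be queried to decide which
# precision to request before building a model:
#
#     env = environment()
#     dtype = generate.F64 if env.has_double else generate.F32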
def has_double(device):
    """
    Return true if device supports double precision.
    """
    return "cl_khr_fp64" in device.extensions

def get_warp(kernel, queue):
    """
    Return the size of an execution batch for *kernel* running on *queue*.
    """
    return kernel.get_work_group_info(
        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
        queue.device)

def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
    """
    Stretch an input vector to the correct boundary.

    Performance on the kernels can drop by a factor of two or more if the
    number of values to compute does not fall on a nice power of two
    boundary.  The trailing additional vector elements are given a
    value of *extra*, and so f(*extra*) will be computed for each of
    them.  The values of interest will thus be a subset of the computed
    values.

    *boundary* should be a power of 2 which is at least 32 for good
    performance on current platforms (as of Jan 2015).  It should
    probably be the max of get_warp(kernel, queue) and
    device.min_data_type_align_size//4.
    """
    remainder = vector.size % boundary
    if remainder != 0:
        size = vector.size + (boundary - remainder)
        vector = np.hstack((vector, [extra] * (size - vector.size)))
    return np.ascontiguousarray(vector, dtype=dtype)

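# Worked example (illustrative): a 100-point vector padded to the next
# multiple of 32 gains 28 trailing points, each set to *extra*:
#
#     q = np.linspace(0.001, 0.5, 100)
#     padded = _stretch_input(q, np.float32, extra=1e-3, boundary=32)
#     assert padded.size == 128 and padded[-1] == np.float32(1e-3)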

def compile_model(context, source, dtype):
    """
    Build a model to run on the GPU.

    Returns the compiled program.  Raises RuntimeError if the desired type
    is float64 but any device in the context lacks the cl_khr_fp64
    extension needed for double precision.
    """
    dtype = np.dtype(dtype)
    if dtype == generate.F64 and not all(has_double(d) for d in context.devices):
        raise RuntimeError("Double precision not supported for devices")

    header = F64_DEFS if dtype == generate.F64 else ""
    if dtype == generate.F32:
        source = generate.use_single(source)
    # Note: USE_SINCOS makes the intel cpu slower under opencl
    if context.devices[0].type == cl.device_type.GPU:
        header += "#define USE_SINCOS\n"
    program = cl.Program(context, header + source).build()
    return program


def make_result(self, size):
    # Allocate the result array on the host and a matching buffer on the device.
    self.res = np.empty(size, dtype=self.dtype)
    self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
    return self.res, self.res_b


# for now, this returns one device in the context
# TODO: create a context that contains all devices on all platforms
class GpuEnvironment(object):
    """
    GPU context, with possibly many devices, and one queue per device.
    """
    def __init__(self):
        # find gpu context
        #self.context = cl.create_some_context()

        self.context = None
        if 'PYOPENCL_CTX' in os.environ:
            self._create_some_context()

        if not self.context:
            self.context = _get_default_context()

        # Byte boundary for data alignment
        #self.data_boundary = max(d.min_data_type_align_size
        #                         for d in self.context.devices)
        self.queues = [cl.CommandQueue(self.context, d)
                       for d in self.context.devices]
        self.has_double = all(has_double(d) for d in self.context.devices)
        self.compiled = {}

    def _create_some_context(self):
        try:
            self.context = cl.create_some_context(interactive=False)
        except Exception as exc:
            warnings.warn(str(exc))
            warnings.warn("pyopencl.create_some_context() failed")
            warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")

    def compile_program(self, name, source, dtype):
        # Compile on first use and cache the program by model name.
        if name not in self.compiled:
            #print "compiling", name
            self.compiled[name] = compile_model(self.context, source, dtype)
        return self.compiled[name]

    def release_program(self, name):
        # Drop the cached program so its resources can be reclaimed.
        if name in self.compiled:
            self.compiled[name].release()
            del self.compiled[name]

def _get_default_context():
    """
    Return an OpenCL context for the preferred device: the first GPU if one
    is available, otherwise the first device found.
    """
    default = None
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if device.type == cl.device_type.GPU:
                return cl.Context([device])
            if default is None:
                default = device

    if not default:
        raise RuntimeError("OpenCL device not found")

    return cl.Context([default])


class GpuModel(object):
    """
    GPU wrapper for a single model.

    *source* and *info* are the model source and interface as returned
    from :func:`generate.make`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
    """
    def __init__(self, source, info, dtype=generate.F32):
        self.info = info
        self.source = source
        self.dtype = np.dtype(dtype)
        self.program = None # delay program creation

    def __getstate__(self):
        state = self.__dict__.copy()
        state['program'] = None
        return state

    def __setstate__(self, state):
        self.__dict__ = state.copy()

    def __call__(self, q_input):
        if self.dtype != q_input.dtype:
            raise TypeError("data is %s kernel is %s" % (q_input.dtype, self.dtype))
        if self.program is None:
            compiler = environment().compile_program
            self.program = compiler(self.info['name'], self.source, self.dtype)
        kernel_name = generate.kernel_name(self.info, q_input.is_2D)
        kernel = getattr(self.program, kernel_name)
        return GpuKernel(kernel, self.info, q_input)

    def release(self):
        if self.program is not None:
            environment().release_program(self.info['name'])
            self.program = None

    def make_input(self, q_vectors):
        """
        Make q input vectors available to the model.

        Note that each model needs its own q vector even in the case of
        mixture models because some models may be OpenCL, some may be
        ctypes and some may be pure python.
        """
        return GpuInput(q_vectors, dtype=self.dtype)

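# For a mixture (sketch; model_a, model_b, the parameter lists and the weights
# are placeholders), each component model wraps the q vectors separately:
#
#     q_input_a = model_a.make_input([q])
#     q_input_b = model_b.make_input([q])
#     Iq = w_a*model_a(q_input_a)(pars_a, []) + w_b*model_b(q_input_b)(pars_b, [])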
# TODO: check that we don't need a destructor for buffers which go out of scope
class GpuInput(object):
    """
    Make q data available to the gpu.

    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
    to get the best performance on OpenCL, which may involve shifting and
    stretching the array to better match the memory architecture.  Additional
    points will be evaluated with *q=1e-3*.

    *dtype* is the data type for the q vectors.  The data type should be
    set to match that of the kernel, which is an attribute of
    :class:`GpuModel`.  Note that not all kernels support double
    precision, so even if the model was created for double precision,
    the *GpuModel.dtype* may be single precision.

    Call :meth:`release` when complete.  Even if not called directly, the
    buffer will be released when the data object is freed.
    """
    def __init__(self, q_vectors, dtype=generate.F32):
        env = environment()
        self.nq = q_vectors[0].size
        self.dtype = np.dtype(dtype)
        self.is_2D = (len(q_vectors) == 2)
        # TODO: stretch input based on get_warp()
        # not doing it now since warp depends on kernel, which is not known
        # at this point, so instead using 32, which is good on the set of
        # architectures tested so far.
        self.q_vectors = [_stretch_input(q, self.dtype, boundary=32)
                          for q in q_vectors]
        self.q_buffers = [
            cl.Buffer(env.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
            for q in self.q_vectors
        ]
        self.global_size = [self.q_vectors[0].size]

    def release(self):
        for b in self.q_buffers:
            b.release()
        self.q_buffers = []

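# Example (sketch): wrapping a 2-D detector grid for the GPU.  The grid shown
# here is a placeholder for the actual measurement coordinates:
#
#     qx, qy = np.meshgrid(np.linspace(-0.5, 0.5, 128),
#                          np.linspace(-0.5, 0.5, 128))
#     q_input = GpuInput([qx.flatten(), qy.flatten()], dtype=np.float32)
#     ...  # evaluate one or more kernels against q_input
#     q_input.release()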
class GpuKernel(object):
    """
    Callable SAS kernel.

    *kernel* is the compiled OpenCL kernel to call.

    *info* is the module information

    *q_input* is the :class:`GpuInput` q vectors at which the kernel should
    be evaluated.

    The resulting call method takes the *pars*, a list of values for
    the fixed parameters to the kernel, and *pd_pars*, a list of (value, weight)
    vectors for the polydisperse parameters.  *cutoff* determines the
    integration limits: any points with combined weight less than *cutoff*
    will not be calculated.

    Call :meth:`release` when done with the kernel instance.
    """
    def __init__(self, kernel, info, q_input):
        self.q_input = q_input
        self.kernel = kernel
        self.info = info
        self.res = np.empty(q_input.nq, q_input.dtype)
        dim = '2d' if q_input.is_2D else '1d'
        self.fixed_pars = info['partype']['fixed-' + dim]
        self.pd_pars = info['partype']['pd-' + dim]

        # Inputs and outputs for each kernel call
        # Note: res may be shorter than res_b if global_size != nq
        env = environment()
        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
                                  2 * MAX_LOOPS * q_input.dtype.itemsize)
                        for _ in env.queues]
        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
                                q_input.global_size[0] * q_input.dtype.itemsize)
                      for _ in env.queues]

    def __call__(self, fixed_pars, pd_pars, cutoff=1e-5):
        real = np.float32 if self.q_input.dtype == generate.F32 else np.float64

        device_num = 0
        queuei = environment().queues[device_num]
        res_bi = self.res_b[device_num]
        nq = np.uint32(self.q_input.nq)
        if pd_pars:
            cutoff = real(cutoff)
            loops_N = [np.uint32(len(p[0])) for p in pd_pars]
            loops = np.hstack(pd_pars)
            loops = np.ascontiguousarray(loops.T, self.q_input.dtype).flatten()
            #print "loops", loops_N, loops

            #import sys; print >>sys.stderr, "opencl eval", pars
            #print "opencl eval", pars
            if len(loops) > 2 * MAX_LOOPS:
                raise ValueError("too many polydispersity points")

            loops_bi = self.loops_b[device_num]
            cl.enqueue_copy(queuei, loops_bi, loops)
            loops_l = cl.LocalMemory(len(loops.data))
            #ctx = environment().context
            #loops_bi = cl.Buffer(ctx, mf.READ_ONLY|mf.COPY_HOST_PTR, hostbuf=loops)
            dispersed = [loops_bi, loops_l, cutoff] + loops_N
        else:
            dispersed = []
        fixed = [real(p) for p in fixed_pars]
        args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed
        self.kernel(queuei, self.q_input.global_size, None, *args)
        cl.enqueue_copy(queuei, self.res, res_bi)

        return self.res

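    # Example call (sketch only; the parameter names and values are
    # placeholders that depend on the particular model *info*):
    #
    #     radius = np.linspace(40., 60., 21)             # polydisperse radius
    #     weight = np.exp(-0.5*((radius - 50.)/5.)**2)   # Gaussian weights
    #     Iq = kernel([1e-6, 6.3e-6, 0.], [(radius, weight)], cutoff=1e-5)
    #
    # Each entry in *pd_pars* is a (value, weight) pair of vectors; points
    # whose combined weight falls below *cutoff* are not calculated.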
    def release(self):
        for b in self.loops_b:
            b.release()
        self.loops_b = []
        for b in self.res_b:
            b.release()
        self.res_b = []

    def __del__(self):
        self.release()