kernelcl.py @ 9404dd3

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since 9404dd3 was 9404dd3, checked in by Paul Kienzle <pkienzle@…>, 8 years ago
python 3.x support
Property mode set to `100644`
File size: 14.5 KB

Line
1	"""
2	GPU support through OpenCL
3
4	There should be a single GPU environment running on the system. This
5	environment is constructed on the first call to :func:`env`, and the
6	same environment is returned on each call.
7
8	After retrieving the environment, the next step is to create the kernel.
9	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10	returns the type of data used by the kernel.
11
12	Next a :class:`GpuData` object should be created with the correct kind
13	of data. This data object can be used by multiple kernels, for example,
14	if the target model is a weighted sum of multiple kernels. The data
15	should include any extra evaluation points required to compute the proper
16	data smearing. This need not match the square grid for 2D data if there
17	is an index saying which q points are active.
18
19	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20	This kernel is used during fitting, receiving new sets of parameters and
21	evaluating them. The output value is stored in an output buffer on the
22	devices, where it can be combined with other structure factors and form
23	factors and have instrumental resolution effects applied.
24
25	In order to use OpenCL for your models, you will need OpenCL drivers for
26	your machine. These should be available from your graphics card vendor.
27	Intel provides OpenCL drivers for CPUs as well as their integrated HD
28	graphics chipsets. AMD also provides drivers for Intel CPUs, but as of
29	this writing the performance is lacking compared to the Intel drivers.
30	NVidia combines drivers for CUDA and OpenCL in one package. The result
31	is a bit messy if you have multiple drivers installed. You can see which
32	drivers are available by starting python and running:
33
34	import pyopencl as cl
35	cl.create_some_context(interactive=True)
36
37	Once you have done that, it will show the available drivers which you
38	can select. It will then tell you that you can use these drivers
39	automatically by setting the PYOPENCL_CTX environment variable.
40
41	Some graphics cards have multiple devices on the same card. You cannot
42	yet use both of them concurrently to evaluate models, but you can run
43	the program twice using a different device for each session.
44
45	OpenCL kernels are compiled when needed by the device driver. Some
46	drivers produce compiler output even when there is no error. You
47	can see the output by setting PYOPENCL_COMPILER_OUTPUT=1. It should be
48	harmless, albeit annoying.
49	"""
50	import os
51	import warnings
52
53	import numpy as np
54
55	try:
56	import pyopencl as cl
57	# Ask OpenCL for the default context so that we know that one exists
58	cl.create_some_context(interactive=False)
59	except Exception as exc:
60	warnings.warn(str(exc))
61	raise RuntimeError("OpenCL not available")
62
63	from pyopencl import mem_flags as mf
64
65	from . import generate
66
67	F64_DEFS = """\
68	#ifdef cl_khr_fp64
69	# pragma OPENCL EXTENSION cl_khr_fp64: enable
70	#endif
71	"""
72
73	# The max loops number is limited by the amount of local memory available
74	# on the device. You don't want to make this value too big because it will
75	# waste resources, nor too small because it may interfere with users trying
76	# to do their polydispersity calculations. A value of 1024 should be much
77	# larger than necessary given that cost grows as npts^k where k is the number
78	# of polydisperse parameters.
79	MAX_LOOPS = 2048
80
81
82	ENV = None
83	def environment():
84	"""
85	Returns a singleton :class:`GpuEnvironment`.
86
87	This provides an OpenCL context and one queue per device.
88	"""
89	global ENV
90	if ENV is None:
91	ENV = GpuEnvironment()
92	return ENV
93
94	def has_double(device):
95	"""
96	Return true if device supports double precision.
97	"""
98	return "cl_khr_fp64" in device.extensions
99
100	def get_warp(kernel, queue):
101	"""
102	Return the size of an execution batch for kernel running on queue.
103	"""
104	return kernel.get_work_group_info(
105	cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
106	queue.device)
107
108	def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
109	"""
110	Stretch an input vector to the correct boundary.
111
112	Performance on the kernels can drop by a factor of two or more if the
113	number of values to compute does not fall on a nice power of two
114	boundary. The trailing additional vector elements are given a
115	value of extra, and so f(extra) will be computed for each of
116	them. The returned array will thus be a subset of the computed array.
117
118	boundary should be a power of 2 which is at least 32 for good
119	performance on current platforms (as of Jan 2015). It should
120	probably be the max of get_warp(kernel,queue) and
121	device.min_data_type_align_size//4.
122	"""
123	remainder = vector.size % boundary
124	if remainder != 0:
125	size = vector.size + (boundary - remainder)
126	vector = np.hstack((vector, [extra] * (size - vector.size)))
127	return np.ascontiguousarray(vector, dtype=dtype)
128
129
130	def compile_model(context, source, dtype):
131	"""
132	Build a model to run on the gpu.
133
134	Returns the compiled program and its type. The returned type will
135	be float32 even if the desired type is float64 if any of the
136	devices in the context do not support the cl_khr_fp64 extension.
137	"""
138	dtype = np.dtype(dtype)
139	if dtype == generate.F64 and not all(has_double(d) for d in context.devices):
140	raise RuntimeError("Double precision not supported for devices")
141
142	header = F64_DEFS if dtype == generate.F64 else ""
143	if dtype == generate.F32:
144	source = generate.use_single(source)
145	# Note: USE_SINCOS makes the intel cpu slower under opencl
146	if context.devices[0].type == cl.device_type.GPU:
147	header += "#define USE_SINCOS\n"
148	program = cl.Program(context, header + source).build()
149	return program
150
151
152	def make_result(self, size):
153	self.res = np.empty(size, dtype=self.dtype)
154	self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
155	return self.res, self.res_b
156
157
158	# for now, this returns one device in the context
159	# TODO: create a context that contains all devices on all platforms
160	class GpuEnvironment(object):
161	"""
162	GPU context, with possibly many devices, and one queue per device.
163	"""
164	def __init__(self):
165	# find gpu context
166	#self.context = cl.create_some_context()
167
168	self.context = None
169	if 'PYOPENCL_CTX' in os.environ:
170	self._create_some_context()
171
172	if not self.context:
173	self.context = _get_default_context()
174
175	# Byte boundary for data alignment
176	#self.data_boundary = max(d.min_data_type_align_size
177	# for d in self.context.devices)
178	self.queues = [cl.CommandQueue(self.context, d)
179	for d in self.context.devices]
180	self.has_double = all(has_double(d) for d in self.context.devices)
181	self.compiled = {}
182
183	def _create_some_context(self):
184	try:
185	self.context = cl.create_some_context(interactive=False)
186	except Exception as exc:
187	warnings.warn(str(exc))
188	warnings.warn("pyopencl.create_some_context() failed")
189	warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
190
191	def compile_program(self, name, source, dtype):
192	if name not in self.compiled:
193	#print("compiling",name)
194	self.compiled[name] = compile_model(self.context, source, dtype)
195	return self.compiled[name]
196
197	def release_program(self, name):
198	if name in self.compiled:
199	self.compiled[name].release()
200	del self.compiled[name]
201
202	def _get_default_context():
203	default = None
204	for platform in cl.get_platforms():
205	for device in platform.get_devices():
206	if device.type == cl.device_type.GPU:
207	return cl.Context([device])
208	if default is None:
209	default = device
210
211	if not default:
212	raise RuntimeError("OpenCL device not found")
213
214	return cl.Context([default])
215
216
217	class GpuModel(object):
218	"""
219	GPU wrapper for a single model.
220
221	source and info are the model source and interface as returned
222	from :func:`gen.make`.
223
224	dtype is the desired model precision. Any numpy dtype for single
225	or double precision floats will do, such as 'f', 'float32' or 'single'
226	for single and 'd', 'float64' or 'double' for double. Double precision
227	is an optional extension which may not be available on all devices.
228	"""
229	def __init__(self, source, info, dtype=generate.F32):
230	self.info = info
231	self.source = source
232	self.dtype = np.dtype(dtype)
233	self.program = None # delay program creation
234
235	def __getstate__(self):
236	state = self.__dict__.copy()
237	state['program'] = None
238	return state
239
240	def __setstate__(self, state):
241	self.__dict__ = state.copy()
242
243	def __call__(self, q_input):
244	if self.dtype != q_input.dtype:
245	raise TypeError("data is %s kernel is %s" % (q_input.dtype, self.dtype))
246	if self.program is None:
247	compiler = environment().compile_program
248	self.program = compiler(self.info['name'], self.source, self.dtype)
249	kernel_name = generate.kernel_name(self.info, q_input.is_2D)
250	kernel = getattr(self.program, kernel_name)
251	return GpuKernel(kernel, self.info, q_input)
252
253	def release(self):
254	if self.program is not None:
255	environment().release_program(self.info['name'])
256	self.program = None
257
258	def make_input(self, q_vectors):
259	"""
260	Make q input vectors available to the model.
261
262	Note that each model needs its own q vector even if the case of
263	mixture models because some models may be OpenCL, some may be
264	ctypes and some may be pure python.
265	"""
266	return GpuInput(q_vectors, dtype=self.dtype)
267
268	# TODO: check that we don't need a destructor for buffers which go out of scope
269	class GpuInput(object):
270	"""
271	Make q data available to the gpu.
272
273	q_vectors is a list of q vectors, which will be [q] for 1-D data,
274	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
275	to get the best performance on OpenCL, which may involve shifting and
276	stretching the array to better match the memory architecture. Additional
277	points will be evaluated with q=1e-3.
278
279	dtype is the data type for the q vectors. The data type should be
280	set to match that of the kernel, which is an attribute of
281	:class:`GpuProgram`. Note that not all kernels support double
282	precision, so even if the program was created for double precision,
283	the GpuProgram.dtype may be single precision.
284
285	Call :meth:`release` when complete. Even if not called directly, the
286	buffer will be released when the data object is freed.
287	"""
288	def __init__(self, q_vectors, dtype=generate.F32):
289	env = environment()
290	self.nq = q_vectors[0].size
291	self.dtype = np.dtype(dtype)
292	self.is_2D = (len(q_vectors) == 2)
293	# TODO: stretch input based on get_warp()
294	# not doing it now since warp depends on kernel, which is not known
295	# at this point, so instead using 32, which is good on the set of
296	# architectures tested so far.
297	self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
298	self.q_buffers = [
299	cl.Buffer(env.context, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=q)
300	for q in self.q_vectors
301	]
302	self.global_size = [self.q_vectors[0].size]
303
304	def release(self):
305	for b in self.q_buffers:
306	b.release()
307	self.q_buffers = []
308
309	class GpuKernel(object):
310	"""
311	Callable SAS kernel.
312
313	kernel is the GpuKernel object to call.
314
315	info is the module information
316
317	q_input is the DllInput q vectors at which the kernel should be
318	evaluated.
319
320	The resulting call method takes the pars, a list of values for
321	the fixed parameters to the kernel, and pd_pars, a list of (value,weight)
322	vectors for the polydisperse parameters. cutoff determines the
323	integration limits: any points with combined weight less than cutoff
324	will not be calculated.
325
326	Call :meth:`release` when done with the kernel instance.
327	"""
328	def __init__(self, kernel, info, q_input):
329	self.q_input = q_input
330	self.kernel = kernel
331	self.info = info
332	self.res = np.empty(q_input.nq, q_input.dtype)
333	dim = '2d' if q_input.is_2D else '1d'
334	self.fixed_pars = info['partype']['fixed-' + dim]
335	self.pd_pars = info['partype']['pd-' + dim]
336
337	# Inputs and outputs for each kernel call
338	# Note: res may be shorter than res_b if global_size != nq
339	env = environment()
340	self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
341	2 * MAX_LOOPS * q_input.dtype.itemsize)
342	for _ in env.queues]
343	self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
344	q_input.global_size[0] * q_input.dtype.itemsize)
345	for _ in env.queues]
346
347
348	def __call__(self, fixed_pars, pd_pars, cutoff=1e-5):
349	real = np.float32 if self.q_input.dtype == generate.F32 else np.float64
350
351	device_num = 0
352	queuei = environment().queues[device_num]
353	res_bi = self.res_b[device_num]
354	nq = np.uint32(self.q_input.nq)
355	if pd_pars:
356	cutoff = real(cutoff)
357	loops_N = [np.uint32(len(p[0])) for p in pd_pars]
358	loops = np.hstack(pd_pars) \
359	if pd_pars else np.empty(0, dtype=self.q_input.dtype)
360	loops = np.ascontiguousarray(loops.T, self.q_input.dtype).flatten()
361	#print("loops",Nloops, loops)
362
363	#import sys; print("opencl eval",pars)
364	#print("opencl eval",pars)
365	if len(loops) > 2 * MAX_LOOPS:
366	raise ValueError("too many polydispersity points")
367
368	loops_bi = self.loops_b[device_num]
369	cl.enqueue_copy(queuei, loops_bi, loops)
370	loops_l = cl.LocalMemory(len(loops.data))
371	#ctx = environment().context
372	#loops_bi = cl.Buffer(ctx, mf.READ_ONLY\|mf.COPY_HOST_PTR, hostbuf=loops)
373	dispersed = [loops_bi, loops_l, cutoff] + loops_N
374	else:
375	dispersed = []
376	fixed = [real(p) for p in fixed_pars]
377	args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed
378	self.kernel(queuei, self.q_input.global_size, None, *args)
379	cl.enqueue_copy(queuei, self.res, res_bi)
380
381	return self.res
382
383	def release(self):
384	for b in self.loops_b:
385	b.release()
386	self.loops_b = []
387	for b in self.res_b:
388	b.release()
389	self.res_b = []
390
391	def __del__(self):
392	self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernelcl.py @ 9404dd3

Download in other formats: