source: sasmodels/sasmodels/kernelcl.py @ 33af590

Last change: 33af590, checked in by Paul Kienzle <pkienzle@…>, 8 years ago

monkey patch pyopencl so that include path is quoted appropriately

1"""
2GPU driver for C kernels
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
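
For example, a minimal sketch of the flow (*source* and *model_info* as
returned from :func:`generate.make_source` and
:func:`generate.make_model_info`):

    model = GpuModel(source, model_info, dtype='single')
    kernel = model.make_kernel([qx, qy])   # use [q] for 1-D data
    Iq = kernel(fixed_pars, pd_pars, cutoff=1e-5)
    kernel.release()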

In order to use OpenCL for your models, you will need OpenCL drivers for
your machine.  These should be available from your graphics card vendor.
Intel provides OpenCL drivers for CPUs as well as their integrated HD
graphics chipsets.  AMD also provides drivers for Intel CPUs, but as of
this writing the performance is lacking compared to the Intel drivers.
NVidia combines drivers for CUDA and OpenCL in one package.  The result
is a bit messy if you have multiple drivers installed.  You can see which
drivers are available by starting Python and running:

    import pyopencl as cl
    cl.create_some_context(interactive=True)

This will list the available drivers and let you select one.  It will
then tell you the value to assign to the PYOPENCL_CTX environment
variable so that the same driver is chosen automatically.
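
For example, in a bash shell (the value '0' and the script name are
placeholders; use the value reported by the interactive prompt):

    PYOPENCL_CTX='0' python your_script.py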

Some graphics cards have multiple devices on the same card.  You cannot
yet use both of them concurrently to evaluate models, but you can run
the program twice using a different device for each session.

OpenCL kernels are compiled when needed by the device driver.  Some
drivers produce compiler output even when there is no error.  You
can see the output by setting PYOPENCL_COMPILER_OUTPUT=1.  It should be
harmless, albeit annoying.
"""
import os
import warnings
import logging

import numpy as np

try:
    import pyopencl as cl
    # Ask OpenCL for the default context so that we know that one exists
    cl.create_some_context(interactive=False)
except Exception as exc:
    warnings.warn(str(exc))
    raise RuntimeError("OpenCL not available")

# CRUFT: pyopencl < 2017.1  (as of June 2016 needs quotes around include path)
def _quote_path(v):
    """
    Quote the path if it is not already quoted.

    If v starts with '-', then assume that it is a -I option or similar
    and do not quote it.  This is fragile: -Ipath with space needs to
    be quoted.
    """
    return '"'+v+'"' if v and ' ' in v and v[0] not in "\"'-" else v

if hasattr(cl, '_DEFAULT_INCLUDE_OPTIONS'):
    cl._DEFAULT_INCLUDE_OPTIONS = [_quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS]
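
# Illustration (not executed): paths with spaces gain quotes; option-style
# values pass through untouched, which is the fragile case noted above.
#     _quote_path('C:/Program Files/inc')   -> '"C:/Program Files/inc"'
#     _quote_path('-IC:/Program Files/inc') -> '-IC:/Program Files/inc'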

from pyopencl import mem_flags as mf
from pyopencl.characterize import get_fast_inaccurate_build_options

from . import generate

# The maximum number of loop values is limited by the amount of local
# memory available on the device.  You don't want to make this value too
# big because it will waste resources, nor too small because it may limit
# the polydispersity calculations that users can run.  A value of 2048
# should be much larger than necessary given that cost grows as npts^k
# where k is the number of polydisperse parameters.
MAX_LOOPS = 2048
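
# For example, three polydisperse parameters with 35 points each store
# 3*35 = 105 (value, weight) pairs in the loops buffer (well under
# MAX_LOOPS), even though the evaluation cost is 35**3 ~ 43000 terms
# per q point.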


ENV = None
def environment():
    """
    Returns a singleton :class:`GpuEnvironment`.

    This provides an OpenCL context and one queue per device.
    """
    global ENV
    if ENV is None:
        ENV = GpuEnvironment()
    return ENV

def has_type(device, dtype):
    """
    Return True if the device supports the requested precision.
    """
    if dtype == generate.F32:
        return True
    elif dtype == generate.F64:
        return "cl_khr_fp64" in device.extensions
    elif dtype == generate.F16:
        return "cl_khr_fp16" in device.extensions
    else:
        return False

def get_warp(kernel, queue):
    """
    Return the size of an execution batch for *kernel* running on *queue*.
    """
    return kernel.get_work_group_info(
        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
        queue.device)

def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
    """
    Stretch an input vector to the correct boundary.

    Performance on the kernels can drop by a factor of two or more if the
    number of values to compute does not fall on a nice power of two
    boundary.  The trailing additional vector elements are given a
    value of *extra*, and so f(*extra*) will be computed for each of
    them.  The returned array will thus be a subset of the computed array.

    *boundary* should be a power of 2 which is at least 32 for good
    performance on current platforms (as of Jan 2015).  It should
    probably be the max of get_warp(kernel, queue) and
    device.min_data_type_align_size//4.
    """
    remainder = vector.size % boundary
    if remainder != 0:
        size = vector.size + (boundary - remainder)
        vector = np.hstack((vector, [extra] * (size - vector.size)))
    return np.ascontiguousarray(vector, dtype=dtype)

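# Illustration (not executed): a 1000-point q vector is padded to 1024,
# the next multiple of 32, with the 24 extra points set to 1e-3.
#     q = np.linspace(0.001, 0.5, 1000)
#     _stretch_input(q, np.float32).size  ->  1024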

def compile_model(context, source, dtype, fast=False):
    """
    Build a model to run on the gpu.

    Returns the compiled program.  Raises RuntimeError if any device in
    the context does not support the requested *dtype* (for example, a
    device missing the cl_khr_fp64 extension needed for float64).
    """
    dtype = np.dtype(dtype)
    if not all(has_type(d, dtype) for d in context.devices):
        raise RuntimeError("%s not supported for devices" % dtype)

    source = generate.convert_type(source, dtype)
    # Note: USE_SINCOS makes the intel cpu slower under opencl
    if context.devices[0].type == cl.device_type.GPU:
        source = "#define USE_SINCOS\n" + source
    options = (get_fast_inaccurate_build_options(context.devices[0])
               if fast else [])
    program = cl.Program(context, source).build(options=options)
    return program


# for now, this returns one device in the context
# TODO: create a context that contains all devices on all platforms
class GpuEnvironment(object):
    """
    GPU environment, consisting of a list of contexts (each with possibly
    many devices) and one command queue per context.
    """
    def __init__(self):
        # find gpu context
        #self.context = cl.create_some_context()

        self.context = None
        if 'PYOPENCL_CTX' in os.environ:
            self._create_some_context()

        if not self.context:
            self.context = _get_default_context()

        # Byte boundary for data alignment
        #self.data_boundary = max(d.min_data_type_align_size
        #                         for d in self.context.devices)
        self.queues = [cl.CommandQueue(context, context.devices[0])
                       for context in self.context]
        self.compiled = {}

    def has_type(self, dtype):
        """
        Return True if any of the available devices support the given type.
        """
        dtype = generate.F32 if dtype == 'fast' else np.dtype(dtype)
        return any(has_type(d, dtype)
                   for context in self.context
                   for d in context.devices)

    def get_queue(self, dtype):
        """
        Return a command queue for the kernels of type dtype.
        """
        for context, queue in zip(self.context, self.queues):
            if all(has_type(d, dtype) for d in context.devices):
                return queue

    def get_context(self, dtype):
        """
        Return an OpenCL context for the kernels of type dtype.
        """
        for context, queue in zip(self.context, self.queues):
            if all(has_type(d, dtype) for d in context.devices):
                return context

    def _create_some_context(self):
        """
        Protected call to cl.create_some_context without interactivity.  Use
        this if PYOPENCL_CTX is set in the environment.  Sets the *context*
        attribute.
        """
        try:
            self.context = [cl.create_some_context(interactive=False)]
        except Exception as exc:
            warnings.warn(str(exc))
            warnings.warn("pyopencl.create_some_context() failed")
            warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")

    def compile_program(self, name, source, dtype, fast=False):
        """
        Compile the program for the device in the given context.
        """
        key = "%s-%s%s" % (name, dtype, ("-fast" if fast else ""))
        if key not in self.compiled:
            context = self.get_context(dtype)
            logging.info("building %s for OpenCL %s",
                         key, context.devices[0].name.strip())
            program = compile_model(context, source, np.dtype(dtype), fast)
            self.compiled[key] = program
        return self.compiled[key]

    def release_program(self, name):
        """
        Free memory associated with the program on the device.
        """
        # Programs are cached under a "name-dtype[-fast]" key, so release
        # every compiled variant of the named model.
        for key in [k for k in self.compiled if k.startswith(name + "-")]:
            self.compiled[key].release()
            del self.compiled[key]

def _get_default_context():
    """
    Get an OpenCL context, preferring GPU over CPU, and preferring Intel
    drivers over AMD drivers.
    """
    # Note: on mobile devices there is automatic clock scaling if either the
    # CPU or the GPU is underutilized; this probably doesn't affect us, but
    # if it did, it would mean that putting a busy loop on the CPU while the
    # GPU is running may increase throughput.
    #
    # Macbook pro, base install:
    #     {'Apple': [Intel CPU, NVIDIA GPU]}
    # Macbook pro, base install:
    #     {'Apple': [Intel CPU, Intel GPU]}
    # 2 x nvidia 295 with Intel and NVIDIA opencl drivers installed
    #     {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]}
    gpu, cpu = None, None
    for platform in cl.get_platforms():
        # AMD provides a much weaker CPU driver than Intel/Apple, so avoid it.
        # If someone has bothered to install the AMD/NVIDIA drivers, prefer
        # them over the integrated graphics driver that may have been supplied
        # with the CPU chipset.
        preferred_cpu = (platform.vendor.startswith('Intel')
                         or platform.vendor.startswith('Apple'))
        preferred_gpu = (platform.vendor.startswith('Advanced')
                         or platform.vendor.startswith('NVIDIA'))
        for device in platform.get_devices():
            if device.type == cl.device_type.GPU:
                # If the existing type is not GPU then it will be CUSTOM or
                # ACCELERATOR, so don't override it.
                if gpu is None or (preferred_gpu and gpu.type == cl.device_type.GPU):
                    gpu = device
            elif device.type == cl.device_type.CPU:
                if cpu is None or preferred_cpu:
                    cpu = device
            else:
                # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM;
                # Intel Phi, for example, registers as an accelerator.  Since
                # the user installed a custom device on their system and went
                # through the pain of sorting out OpenCL drivers for it, let's
                # assume they really do want to use it as their primary
                # compute device.
                gpu = device

    # Order the devices by gpu then by cpu; when searching for an available
    # device by data type they will be checked in this order, which means
    # that if the gpu supports double then the cpu will never be used (though
    # we may make it possible to explicitly request the cpu at some point).
    devices = []
    if gpu is not None:
        devices.append(gpu)
    if cpu is not None:
        devices.append(cpu)
    return [cl.Context([d]) for d in devices]


class GpuModel(object):
    """
    GPU wrapper for a single model.

    *source* and *model_info* are the model source and interface as returned
    from :func:`generate.make_source` and :func:`generate.make_model_info`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
    Half precision ('float16', 'half') may be available on some devices.
    Fast precision ('fast') is a loose version of single precision, indicating
    that the compiler is allowed to take shortcuts.
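
    For example, a sketch that falls back to single precision when the
    devices cannot do double (*source* and *model_info* as above):

        dtype = 'double' if environment().has_type('double') else 'single'
        model = GpuModel(source, model_info, dtype=dtype)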
318    """
    def __init__(self, source, model_info, dtype=generate.F32):
        self.info = model_info
        self.source = source
        self.dtype = generate.F32 if dtype == 'fast' else np.dtype(dtype)
        self.fast = (dtype == 'fast')
        self.program = None  # delay program creation

    def __getstate__(self):
        return self.info, self.source, self.dtype, self.fast

    def __setstate__(self, state):
        self.info, self.source, self.dtype, self.fast = state
        self.program = None

    def make_kernel(self, q_vectors):
        """
        Return a :class:`GpuKernel` for evaluating the model at *q_vectors*.
        """
        if self.program is None:
            compiler = environment().compile_program
            self.program = compiler(self.info['name'], self.source, self.dtype,
                                    self.fast)
        is_2d = len(q_vectors) == 2
        kernel_name = generate.kernel_name(self.info, is_2d)
        kernel = getattr(self.program, kernel_name)
        return GpuKernel(kernel, self.info, q_vectors, self.dtype)

    def release(self):
        """
        Free the resources associated with the model.
        """
        if self.program is not None:
            environment().release_program(self.info['name'])
            self.program = None

    def __del__(self):
        self.release()

# TODO: check that we don't need a destructor for buffers which go out of scope
class GpuInput(object):
    """
    Make q data available to the gpu.

    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
    to get the best performance on OpenCL, which may involve shifting and
    stretching the array to better match the memory architecture.  Additional
    points will be evaluated with *q=1e-3*.

    *dtype* is the data type for the q vectors.  The data type should be
    set to match that of the kernel, which is an attribute of
    :class:`GpuModel`.  Note that not all kernels support double
    precision, so even if the program was created for double precision,
    the *GpuModel.dtype* may be single precision.

    Call :meth:`release` when complete.  Even if not called directly, the
    buffer will be released when the data object is freed.
    """
    def __init__(self, q_vectors, dtype=generate.F32):
        # TODO: do we ever need double precision q?
        env = environment()
        self.nq = q_vectors[0].size
        self.dtype = np.dtype(dtype)
        self.is_2d = (len(q_vectors) == 2)
        # TODO: stretch input based on get_warp()
        # not doing it now since warp depends on kernel, which is not known
        # at this point, so instead using 32, which is good on the set of
        # architectures tested so far.
        # Note: pass boundary by keyword so that the padding value keeps its
        # default of 1e-3 (passing 32 positionally would set *extra* instead).
        self.q_vectors = [_stretch_input(q, self.dtype, boundary=32)
                          for q in q_vectors]
        context = env.get_context(self.dtype)
        self.global_size = [self.q_vectors[0].size]
        #print("creating inputs of size", self.global_size)
        self.q_buffers = [
            cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
            for q in self.q_vectors
        ]

    def release(self):
        """
        Free the memory.
        """
        for b in self.q_buffers:
            b.release()
        self.q_buffers = []

    def __del__(self):
        self.release()

class GpuKernel(object):
    """
    Callable SAS kernel.

    *kernel* is the compiled OpenCL kernel to call.

    *model_info* is the module information

    *q_vectors* is the q vectors at which the kernel should be evaluated

    *dtype* is the kernel precision

    The resulting call method takes the *pars*, a list of values for
    the fixed parameters to the kernel, and *pd_pars*, a list of
    (value, weight) vectors for the polydisperse parameters.  *cutoff*
    determines the integration limits: any points with combined weight
    less than *cutoff* will not be calculated.

    Call :meth:`release` when done with the kernel instance.
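
    For example, a sketch with a single polydisperse parameter whose
    values and weights are in *v* and *w* (*model* is a :class:`GpuModel`):

        kernel = model.make_kernel([q])
        Iq = kernel(fixed_pars, [(v, w)], cutoff=1e-5)
        kernel.release()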
423    """
    def __init__(self, kernel, model_info, q_vectors, dtype):
        q_input = GpuInput(q_vectors, dtype)
        self.kernel = kernel
        self.info = model_info
        self.res = np.empty(q_input.nq, q_input.dtype)
        dim = '2d' if q_input.is_2d else '1d'
        self.fixed_pars = model_info['partype']['fixed-' + dim]
        self.pd_pars = model_info['partype']['pd-' + dim]

        # Inputs and outputs for each kernel call
        # Note: res may be shorter than res_b if global_size != nq
        env = environment()
        self.queue = env.get_queue(dtype)
        self.loops_b = cl.Buffer(self.queue.context, mf.READ_WRITE,
                                 2 * MAX_LOOPS * q_input.dtype.itemsize)
        self.res_b = cl.Buffer(self.queue.context, mf.READ_WRITE,
                               q_input.global_size[0] * q_input.dtype.itemsize)
        self.q_input = q_input

        self._need_release = [self.loops_b, self.res_b, self.q_input]

    def __call__(self, fixed_pars, pd_pars, cutoff):
        real = (np.float32 if self.q_input.dtype == generate.F32
                else np.float64 if self.q_input.dtype == generate.F64
                else np.float16 if self.q_input.dtype == generate.F16
                else np.float32)  # will never get here, so use np.float32

        #print("pars", fixed_pars, pd_pars)
        res_bi = self.res_b
        nq = np.uint32(self.q_input.nq)
        if pd_pars:
            cutoff = real(cutoff)
            loops_N = [np.uint32(len(p[0])) for p in pd_pars]
            # Pack the (value, weight) vectors into one contiguous array of
            # interleaved value/weight pairs for transfer to the device.
            loops = np.hstack(pd_pars)
            loops = np.ascontiguousarray(loops.T, self.q_input.dtype).flatten()
            #print("loops", loops_N, loops)

            if len(loops) > 2 * MAX_LOOPS:
                raise ValueError("too many polydispersity points")

            loops_bi = self.loops_b
            cl.enqueue_copy(self.queue, loops_bi, loops)
            loops_l = cl.LocalMemory(len(loops.data))
            #ctx = environment().context
            #loops_bi = cl.Buffer(ctx, mf.READ_ONLY|mf.COPY_HOST_PTR, hostbuf=loops)
            dispersed = [loops_bi, loops_l, cutoff] + loops_N
        else:
            dispersed = []
        fixed = [real(p) for p in fixed_pars]
        args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed
        self.kernel(self.queue, self.q_input.global_size, None, *args)
        cl.enqueue_copy(self.queue, self.res, res_bi)

        return self.res


    def release(self):
        """
        Release resources associated with the kernel.
        """
        for v in self._need_release:
            v.release()
        self._need_release = []

    def __del__(self):
        self.release()