kernelcuda.py @ 0db7dbd

core_shell_microgelsmagnetic_modelticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since 0db7dbd was 0db7dbd, checked in by pkienzle, 6 years ago
cuda support: allow cylinder model to run under CUDA as well as OpenCL
Property mode set to `100644`
File size: 18.0 KB

Line
1	"""
2	GPU driver for C kernels
3
4	There should be a single GPU environment running on the system. This
5	environment is constructed on the first call to :func:`env`, and the
6	same environment is returned on each call.
7
8	After retrieving the environment, the next step is to create the kernel.
9	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10	returns the type of data used by the kernel.
11
12	Next a :class:`GpuData` object should be created with the correct kind
13	of data. This data object can be used by multiple kernels, for example,
14	if the target model is a weighted sum of multiple kernels. The data
15	should include any extra evaluation points required to compute the proper
16	data smearing. This need not match the square grid for 2D data if there
17	is an index saying which q points are active.
18
19	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20	This kernel is used during fitting, receiving new sets of parameters and
21	evaluating them. The output value is stored in an output buffer on the
22	devices, where it can be combined with other structure factors and form
23	factors and have instrumental resolution effects applied.
24
25	In order to use OpenCL for your models, you will need OpenCL drivers for
26	your machine. These should be available from your graphics card vendor.
27	Intel provides OpenCL drivers for CPUs as well as their integrated HD
28	graphics chipsets. AMD also provides drivers for Intel CPUs, but as of
29	this writing the performance is lacking compared to the Intel drivers.
30	NVidia combines drivers for CUDA and OpenCL in one package. The result
31	is a bit messy if you have multiple drivers installed. You can see which
32	drivers are available by starting python and running:
33
34	import pyopencl as cl
35	cl.create_some_context(interactive=True)
36
37	Once you have done that, it will show the available drivers which you
38	can select. It will then tell you that you can use these drivers
39	automatically by setting the SAS_OPENCL environment variable, which is
40	PYOPENCL_CTX equivalent but not conflicting with other pyopnecl programs.
41
42	Some graphics cards have multiple devices on the same card. You cannot
43	yet use both of them concurrently to evaluate models, but you can run
44	the program twice using a different device for each session.
45
46	OpenCL kernels are compiled when needed by the device driver. Some
47	drivers produce compiler output even when there is no error. You
48	can see the output by setting PYOPENCL_COMPILER_OUTPUT=1. It should be
49	harmless, albeit annoying.
50	"""
51	from __future__ import print_function
52
53	import os
54	import warnings
55	import logging
56	import time
57
58	import numpy as np # type: ignore
59
60
61	# Attempt to setup opencl. This may fail if the opencl package is not
62	# installed or if it is installed but there are no devices available.
63	try:
64	import pycuda.autoinit
65	import pycuda.driver as cuda # type: ignore
66	from pycuda.compiler import SourceModule
67	HAVE_CUDA = True
68	CUDA_ERROR = ""
69	except Exception as exc:
70	HAVE_CUDA = False
71	CUDA_ERROR = str(exc)
72
73	from . import generate
74	from .kernel import KernelModel, Kernel
75
76	# pylint: disable=unused-import
77	try:
78	from typing import Tuple, Callable, Any
79	from .modelinfo import ModelInfo
80	from .details import CallDetails
81	except ImportError:
82	pass
83	# pylint: enable=unused-import
84
85	# The max loops number is limited by the amount of local memory available
86	# on the device. You don't want to make this value too big because it will
87	# waste resources, nor too small because it may interfere with users trying
88	# to do their polydispersity calculations. A value of 1024 should be much
89	# larger than necessary given that cost grows as npts^k where k is the number
90	# of polydisperse parameters.
91	MAX_LOOPS = 2048
92
93
94	# Pragmas for enable OpenCL features. Be sure to protect them so that they
95	# still compile even if OpenCL is not present.
96	_F16_PRAGMA = """\
97	#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)
98	# pragma OPENCL EXTENSION cl_khr_fp16: enable
99	#endif
100	"""
101
102	_F64_PRAGMA = """\
103	#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)
104	# pragma OPENCL EXTENSION cl_khr_fp64: enable
105	#endif
106	"""
107
108	def use_cuda():
109	return HAVE_CUDA
110
111	ENV = None
112	def reset_environment():
113	"""
114	Call to create a new OpenCL context, such as after a change to SAS_OPENCL.
115	"""
116	global ENV
117	ENV = GpuEnvironment() if use_cuda() else None
118
119	def environment():
120	# type: () -> "GpuEnvironment"
121	"""
122	Returns a singleton :class:`GpuEnvironment`.
123
124	This provides an OpenCL context and one queue per device.
125	"""
126	if ENV is None:
127	if not HAVE_CUDA:
128	raise RuntimeError("OpenCL startup failed with ***"
129	+ CUDA_ERROR + "***; using C compiler instead")
130	reset_environment()
131	if ENV is None:
132	raise RuntimeError("SAS_OPENCL=None in environment")
133	return ENV
134
135	def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
136	# type: (np.ndarray, np.dtype, float, int) -> np.ndarray
137	"""
138	Stretch an input vector to the correct boundary.
139
140	Performance on the kernels can drop by a factor of two or more if the
141	number of values to compute does not fall on a nice power of two
142	boundary. The trailing additional vector elements are given a
143	value of extra, and so f(extra) will be computed for each of
144	them. The returned array will thus be a subset of the computed array.
145
146	boundary should be a power of 2 which is at least 32 for good
147	performance on current platforms (as of Jan 2015). It should
148	probably be the max of get_warp(kernel,queue) and
149	device.min_data_type_align_size//4.
150	"""
151	remainder = vector.size % boundary
152	if remainder != 0:
153	size = vector.size + (boundary - remainder)
154	vector = np.hstack((vector, [extra] * (size - vector.size)))
155	return np.ascontiguousarray(vector, dtype=dtype)
156
157
158	def compile_model(source, dtype, fast=False):
159	# type: (str, np.dtype, bool) -> cl.Program
160	"""
161	Build a model to run on the gpu.
162
163	Returns the compiled program and its type. The returned type will
164	be float32 even if the desired type is float64 if any of the
165	devices in the context do not support the cl_khr_fp64 extension.
166	"""
167	source_list = [generate.convert_type(source, dtype)]
168
169	if dtype == generate.F16:
170	source_list.insert(0, _F16_PRAGMA)
171	elif dtype == generate.F64:
172	source_list.insert(0, _F64_PRAGMA)
173
174	source_list.insert(0, "#define USE_SINCOS\n")
175	source = "\n".join(source_list)
176	program = SourceModule(source) # no_extern_c=True, include_dirs=[...]
177	return program
178
179
180	# for now, this returns one device in the context
181	# TODO: create a context that contains all devices on all platforms
182	class GpuEnvironment(object):
183	"""
184	GPU context, with possibly many devices, and one queue per device.
185	"""
186	def __init__(self, devnum=0):
187	# type: () -> None
188	# Byte boundary for data alignment
189	#self.data_boundary = max(d.min_data_type_align_size
190	# for d in self.context.devices)
191	self.compiled = {}
192	#self.device = cuda.Device(devnum)
193	#self.context = self.device.make_context()
194
195	def has_type(self, dtype):
196	# type: (np.dtype) -> bool
197	"""
198	Return True if all devices support a given type.
199	"""
200	return True
201
202	def compile_program(self, name, source, dtype, fast, timestamp):
203	# type: (str, str, np.dtype, bool, float) -> cl.Program
204	"""
205	Compile the program for the device in the given context.
206	"""
207	# Note: PyOpenCL caches based on md5 hash of source, options and device
208	# so we don't really need to cache things for ourselves. I'll do so
209	# anyway just to save some data munging time.
210	tag = generate.tag_source(source)
211	key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
212	# Check timestamp on program
213	program, program_timestamp = self.compiled.get(key, (None, np.inf))
214	if program_timestamp < timestamp:
215	del self.compiled[key]
216	if key not in self.compiled:
217	logging.info("building %s for CUDA", key)
218	program = compile_model(str(source), dtype, fast)
219	self.compiled[key] = (program, timestamp)
220	return program
221
222	class GpuModel(KernelModel):
223	"""
224	GPU wrapper for a single model.
225
226	source and model_info are the model source and interface as returned
227	from :func:`generate.make_source` and :func:`generate.make_model_info`.
228
229	dtype is the desired model precision. Any numpy dtype for single
230	or double precision floats will do, such as 'f', 'float32' or 'single'
231	for single and 'd', 'float64' or 'double' for double. Double precision
232	is an optional extension which may not be available on all devices.
233	Half precision ('float16','half') may be available on some devices.
234	Fast precision ('fast') is a loose version of single precision, indicating
235	that the compiler is allowed to take shortcuts.
236	"""
237	def __init__(self, source, model_info, dtype=generate.F32, fast=False):
238	# type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
239	self.info = model_info
240	self.source = source
241	self.dtype = dtype
242	self.fast = fast
243	self.program = None # delay program creation
244	self._kernels = None
245
246	def __getstate__(self):
247	# type: () -> Tuple[ModelInfo, str, np.dtype, bool]
248	return self.info, self.source, self.dtype, self.fast
249
250	def __setstate__(self, state):
251	# type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
252	self.info, self.source, self.dtype, self.fast = state
253	self.program = None
254
255	def make_kernel(self, q_vectors):
256	# type: (List[np.ndarray]) -> "GpuKernel"
257	if self.program is None:
258	compile_program = environment().compile_program
259	timestamp = generate.ocl_timestamp(self.info)
260	self.program = compile_program(
261	self.info.name,
262	self.source['opencl'],
263	self.dtype,
264	self.fast,
265	timestamp)
266	variants = ['Iq', 'Iqxy', 'Imagnetic']
267	names = [generate.kernel_name(self.info, k) for k in variants]
268	kernels = [self.program.get_function(k) for k in names]
269	self._kernels = dict((k, v) for k, v in zip(variants, kernels))
270	is_2d = len(q_vectors) == 2
271	if is_2d:
272	kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']]
273	else:
274	kernel = [self._kernels['Iq']]*2
275	return GpuKernel(kernel, self.dtype, self.info, q_vectors)
276
277	def release(self):
278	# type: () -> None
279	"""
280	Free the resources associated with the model.
281	"""
282	if self.program is not None:
283	self.program = None
284
285	def __del__(self):
286	# type: () -> None
287	self.release()
288
289	# TODO: check that we don't need a destructor for buffers which go out of scope
290	class GpuInput(object):
291	"""
292	Make q data available to the gpu.
293
294	q_vectors is a list of q vectors, which will be [q] for 1-D data,
295	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
296	to get the best performance on OpenCL, which may involve shifting and
297	stretching the array to better match the memory architecture. Additional
298	points will be evaluated with q=1e-3.
299
300	dtype is the data type for the q vectors. The data type should be
301	set to match that of the kernel, which is an attribute of
302	:class:`GpuProgram`. Note that not all kernels support double
303	precision, so even if the program was created for double precision,
304	the GpuProgram.dtype may be single precision.
305
306	Call :meth:`release` when complete. Even if not called directly, the
307	buffer will be released when the data object is freed.
308	"""
309	def __init__(self, q_vectors, dtype=generate.F32):
310	# type: (List[np.ndarray], np.dtype) -> None
311	# TODO: do we ever need double precision q?
312	self.nq = q_vectors[0].size
313	self.dtype = np.dtype(dtype)
314	self.is_2d = (len(q_vectors) == 2)
315	# TODO: stretch input based on get_warp()
316	# not doing it now since warp depends on kernel, which is not known
317	# at this point, so instead using 32, which is good on the set of
318	# architectures tested so far.
319	if self.is_2d:
320	# Note: 16 rather than 15 because result is 1 longer than input.
321	width = ((self.nq+16)//16)*16
322	self.q = np.empty((width, 2), dtype=dtype)
323	self.q[:self.nq, 0] = q_vectors[0]
324	self.q[:self.nq, 1] = q_vectors[1]
325	else:
326	# Note: 32 rather than 31 because result is 1 longer than input.
327	width = ((self.nq+32)//32)*32
328	self.q = np.empty(width, dtype=dtype)
329	self.q[:self.nq] = q_vectors[0]
330	self.global_size = [self.q.shape[0]]
331	#print("creating inputs of size", self.global_size)
332	self.q_b = cuda.to_device(self.q)
333
334	def release(self):
335	# type: () -> None
336	"""
337	Free the memory.
338	"""
339	if self.q_b is not None:
340	self.q_b.free()
341	self.q_b = None
342
343	def __del__(self):
344	# type: () -> None
345	self.release()
346
347	class GpuKernel(Kernel):
348	"""
349	Callable SAS kernel.
350
351	kernel is the GpuKernel object to call
352
353	model_info is the module information
354
355	q_vectors is the q vectors at which the kernel should be evaluated
356
357	dtype is the kernel precision
358
359	The resulting call method takes the pars, a list of values for
360	the fixed parameters to the kernel, and pd_pars, a list of (value,weight)
361	vectors for the polydisperse parameters. cutoff determines the
362	integration limits: any points with combined weight less than cutoff
363	will not be calculated.
364
365	Call :meth:`release` when done with the kernel instance.
366	"""
367	def __init__(self, kernel, dtype, model_info, q_vectors):
368	# type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None
369	q_input = GpuInput(q_vectors, dtype)
370	self.kernel = kernel
371	self.info = model_info
372	self.dtype = dtype
373	self.dim = '2d' if q_input.is_2d else '1d'
374	# plus three for the normalization values
375	self.result = np.empty(q_input.nq+1, dtype)
376
377	# Inputs and outputs for each kernel call
378	# Note: res may be shorter than res_b if global_size != nq
379	self.result_b = cuda.mem_alloc(q_input.global_size[0] * dtype.itemsize)
380	self.q_input = q_input # allocated by GpuInput above
381
382	self._need_release = [self.result_b, self.q_input]
383	self.real = (np.float32 if dtype == generate.F32
384	else np.float64 if dtype == generate.F64
385	else np.float16 if dtype == generate.F16
386	else np.float32) # will never get here, so use np.float32
387
388	def __call__(self, call_details, values, cutoff, magnetic):
389	# type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray
390	# Arrange data transfer to card
391	details_b = cuda.to_device(call_details.buffer)
392	values_b = cuda.to_device(values)
393
394	kernel = self.kernel[1 if magnetic else 0]
395	args = [
396	np.uint32(self.q_input.nq), None, None,
397	details_b, values_b, self.q_input.q_b, self.result_b,
398	self.real(cutoff),
399	]
400	grid = partition(self.q_input.nq)
401	#print("Calling OpenCL")
402	#call_details.show(values)
403	# Call kernel and retrieve results
404	last_nap = time.clock()
405	step = 1000000//self.q_input.nq + 1
406	#step = 1000000000
407	for start in range(0, call_details.num_eval, step):
408	stop = min(start + step, call_details.num_eval)
409	#print("queuing",start,stop)
410	args[1:3] = [np.int32(start), np.int32(stop)]
411	kernel(args, *grid)
412	if stop < call_details.num_eval:
413	sync()
414	# Allow other processes to run
415	current_time = time.clock()
416	if current_time - last_nap > 0.5:
417	time.sleep(0.05)
418	last_nap = current_time
419	sync()
420	details_b.free()
421	values_b.free()
422	cuda.memcpy_dtoh(self.result, self.result_b)
423	#print("result", self.result)
424
425	pd_norm = self.result[self.q_input.nq]
426	scale = values[0]/(pd_norm if pd_norm != 0.0 else 1.0)
427	background = values[1]
428	#print("scale",scale,values[0],self.result[self.q_input.nq],background)
429	return scale*self.result[:self.q_input.nq] + background
430	# return self.result[:self.q_input.nq]
431
432	def release(self):
433	# type: () -> None
434	"""
435	Release resources associated with the kernel.
436	"""
437	if self.result_b is not None:
438	self.result_b.free()
439	self.result_b = None
440
441	def __del__(self):
442	# type: () -> None
443	self.release()
444
445
446	def sync():
447	"""
448	Overview:
449	Waits for operation in the current context to complete.
450
451	Note: Maybe context.synchronize() is sufficient.
452	"""
453	#return # The following works in C++; don't know what pycuda is doing
454	# Create an event with which to synchronize
455	done = cuda.Event()
456
457	# Schedule an event trigger on the GPU.
458	done.record()
459
460	#line added to not hog resources
461	while not done.query(): time.sleep(0.01)
462
463	# Block until the GPU executes the kernel.
464	done.synchronize()
465	# Clean up the event; I don't think they can be reused.
466	del done
467
468
469	def partition(n):
470	'''
471	Overview:
472	Auto grids the thread blocks to achieve some level of calculation
473	efficiency.
474	'''
475	max_gx,max_gy = 65535,65535
476	blocksize = 32
477	#max_gx,max_gy = 5,65536
478	#blocksize = 3
479	block = (blocksize,1,1)
480	num_blocks = int((n+blocksize-1)/blocksize)
481	if num_blocks < max_gx:
482	grid = (num_blocks,1)
483	else:
484	gx = max_gx
485	gy = (num_blocks + max_gx - 1) / max_gx
486	if gy >= max_gy: raise ValueError("vector is too large")
487	grid = (gx,gy)
488	#print "block",block,"grid",grid
489	#print "waste",block[0]block[1]block[2]grid[0]grid[1] - n
490	return dict(block=block,grid=grid)
491

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernelcuda.py @ 0db7dbd

Download in other formats: