kernelcuda.py @ 3199b17

core_shell_microgelsmagnetic_modelticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since 3199b17 was 3199b17, checked in by Paul Kienzle <pkienzle@…>, 5 years ago
PEP 8 changes and improved consistency between OpenCL/CUDA/DLL code
Property mode set to `100644`
File size: 20.2 KB

Line
1	"""
2	GPU driver for C kernels (with CUDA)
3
4	To select cuda, use SAS_OPENCL=cuda, or SAS_OPENCL=cuda:n for a particular
5	device number. If no device number is specified, then look for CUDA_DEVICE=n
6	or a file ~/.cuda-device containing n for the device number. Otherwise, try
7	all available device numbers.
8
9	TODO: docs are out of date
10
11	There should be a single GPU environment running on the system. This
12	environment is constructed on the first call to :func:`env`, and the
13	same environment is returned on each call.
14
15	After retrieving the environment, the next step is to create the kernel.
16	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
17	returns the type of data used by the kernel.
18
19	Next a :class:`GpuData` object should be created with the correct kind
20	of data. This data object can be used by multiple kernels, for example,
21	if the target model is a weighted sum of multiple kernels. The data
22	should include any extra evaluation points required to compute the proper
23	data smearing. This need not match the square grid for 2D data if there
24	is an index saying which q points are active.
25
26	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
27	This kernel is used during fitting, receiving new sets of parameters and
28	evaluating them. The output value is stored in an output buffer on the
29	devices, where it can be combined with other structure factors and form
30	factors and have instrumental resolution effects applied.
31
32	In order to use OpenCL for your models, you will need OpenCL drivers for
33	your machine. These should be available from your graphics card vendor.
34	Intel provides OpenCL drivers for CPUs as well as their integrated HD
35	graphics chipsets. AMD also provides drivers for Intel CPUs, but as of
36	this writing the performance is lacking compared to the Intel drivers.
37	NVidia combines drivers for CUDA and OpenCL in one package. The result
38	is a bit messy if you have multiple drivers installed. You can see which
39	drivers are available by starting python and running:
40
41	import pyopencl as cl
42	cl.create_some_context(interactive=True)
43
44	Once you have done that, it will show the available drivers which you
45	can select. It will then tell you that you can use these drivers
46	automatically by setting the SAS_OPENCL environment variable, which is
47	PYOPENCL_CTX equivalent but not conflicting with other pyopnecl programs.
48
49	Some graphics cards have multiple devices on the same card. You cannot
50	yet use both of them concurrently to evaluate models, but you can run
51	the program twice using a different device for each session.
52
53	OpenCL kernels are compiled when needed by the device driver. Some
54	drivers produce compiler output even when there is no error. You
55	can see the output by setting PYOPENCL_COMPILER_OUTPUT=1. It should be
56	harmless, albeit annoying.
57	"""
58	from __future__ import print_function
59
60	import os
61	import warnings
62	import logging
63	import time
64	import re
65	import atexit
66
67	import numpy as np # type: ignore
68
69
70	# Attempt to setup CUDA. This may fail if the pycuda package is not
71	# installed or if it is installed but there are no devices available.
72	try:
73	import pycuda.driver as cuda # type: ignore
74	from pycuda.compiler import SourceModule
75	from pycuda.tools import make_default_context, clear_context_caches
76	# Ask CUDA for the default context (so that we know that one exists)
77	# then immediately throw it away in case the user doesn't want it.
78	# Note: cribbed from pycuda.autoinit
79	cuda.init()
80	context = make_default_context()
81	context.pop()
82	clear_context_caches()
83	del context
84	HAVE_CUDA = True
85	CUDA_ERROR = ""
86	except Exception as exc:
87	HAVE_CUDA = False
88	CUDA_ERROR = str(exc)
89
90	from . import generate
91	from .kernel import KernelModel, Kernel
92
93	# pylint: disable=unused-import
94	try:
95	from typing import Tuple, Callable, Any
96	from .modelinfo import ModelInfo
97	from .details import CallDetails
98	except ImportError:
99	pass
100	# pylint: enable=unused-import
101
102	# The max loops number is limited by the amount of local memory available
103	# on the device. You don't want to make this value too big because it will
104	# waste resources, nor too small because it may interfere with users trying
105	# to do their polydispersity calculations. A value of 1024 should be much
106	# larger than necessary given that cost grows as npts^k where k is the number
107	# of polydisperse parameters.
108	MAX_LOOPS = 2048
109
110
111	def use_cuda():
112	sas_opencl = os.environ.get("SAS_OPENCL", "CUDA").lower()
113	return HAVE_CUDA and sas_opencl.startswith("cuda")
114
115
116	ENV = None
117	def reset_environment():
118	"""
119	Call to create a new OpenCL context, such as after a change to SAS_OPENCL.
120	"""
121	global ENV
122	# Free any previous allocated context.
123	if ENV is not None and ENV.context is not None:
124	ENV.release()
125	ENV = GpuEnvironment() if use_cuda() else None
126
127
128	def environment():
129	# type: () -> "GpuEnvironment"
130	"""
131	Returns a singleton :class:`GpuEnvironment`.
132
133	This provides an OpenCL context and one queue per device.
134	"""
135	if ENV is None:
136	if not HAVE_CUDA:
137	raise RuntimeError("CUDA startup failed with ***"
138	+ CUDA_ERROR + "***; using C compiler instead")
139	reset_environment()
140	if ENV is None:
141	raise RuntimeError("SAS_OPENCL=None in environment")
142	return ENV
143
144
145	def has_type(dtype):
146	# type: (np.dtype) -> bool
147	"""
148	Return true if device supports the requested precision.
149	"""
150	# Assume the NVIDIA card supports 32-bit and 64-bit floats.
151	# TODO: Check if pycuda support F16.
152	return dtype in (generate.F32, generate.F64)
153
154
155	FUNCTION_PATTERN = re.compile(r"""^
156	(?P<space>\s*) # Initial space.
157	(?P<qualifiers>^(?:\s\b\w+\b\s)+) # One or more qualifiers before function.
158	(?P<function>\s\b\w+\b\s[(]) # Function name plus open parens.
159	""", re.VERBOSE\|re.MULTILINE)
160
161	MARKED_PATTERN = re.compile(r"""
162	\b(return\|else\|kernel\|device\|__device__)\b
163	""", re.VERBOSE\|re.MULTILINE)
164
165
166	def _add_device_tag(match):
167	# type: (None) -> str
168	# Note: Should be re.Match, but that isn't a simple type.
169	"""
170	replace qualifiers with __device__ qualifiers if needed
171	"""
172	qualifiers = match.group("qualifiers")
173	if MARKED_PATTERN.search(qualifiers):
174	start, end = match.span()
175	return match.string[start:end]
176	else:
177	function = match.group("function")
178	space = match.group("space")
179	return "".join((space, "__device__ ", qualifiers, function))
180
181
182	def mark_device_functions(source):
183	# type: (str) -> str
184	"""
185	Mark all function declarations as __device__ functions (except kernel).
186	"""
187	return FUNCTION_PATTERN.sub(_add_device_tag, source)
188
189
190	def show_device_functions(source):
191	# type: (str) -> str
192	"""
193	Show all discovered function declarations, but don't change any.
194	"""
195	for match in FUNCTION_PATTERN.finditer(source):
196	print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(')
197	return source
198
199
200	def compile_model(source, dtype, fast=False):
201	# type: (str, np.dtype, bool) -> SourceModule
202	"""
203	Build a model to run on the gpu.
204
205	Returns the compiled program and its type. The returned type will
206	be float32 even if the desired type is float64 if any of the
207	devices in the context do not support the cl_khr_fp64 extension.
208	"""
209	dtype = np.dtype(dtype)
210	if not has_type(dtype):
211	raise RuntimeError("%s not supported for devices"%dtype)
212
213	source_list = [generate.convert_type(source, dtype)]
214
215	source_list.insert(0, "#define USE_SINCOS\n")
216	source = "\n".join(source_list)
217	#source = show_device_functions(source)
218	source = mark_device_functions(source)
219	#with open('/tmp/kernel.cu', 'w') as fd: fd.write(source)
220	#print(source)
221	#options = ['--verbose', '-E']
222	options = ['--use_fast_math'] if fast else None
223	program = SourceModule(source, no_extern_c=True, options=options) #, include_dirs=[...])
224
225	#print("done with "+program)
226	return program
227
228
229	# For now, this returns one device in the context.
230	# TODO: Create a context that contains all devices on all platforms.
231	class GpuEnvironment(object):
232	"""
233	GPU context for CUDA.
234	"""
235	context = None # type: cuda.Context
236	def __init__(self, devnum=None):
237	# type: (int) -> None
238	env = os.environ.get("SAS_OPENCL", "").lower()
239	if devnum is None and env.startswith("cuda:"):
240	devnum = int(env[5:])
241
242	# Set the global context to the particular device number if one is
243	# given, otherwise use the default context. Perhaps this will be set
244	# by an environment variable within autoinit.
245	if devnum is not None:
246	self.context = cuda.Device(devnum).make_context()
247	else:
248	self.context = make_default_context()
249
250	## Byte boundary for data alignment.
251	#self.data_boundary = max(d.min_data_type_align_size
252	# for d in self.context.devices)
253
254	# Cache for compiled programs, and for items in context.
255	self.compiled = {}
256
257	def release(self):
258	if self.context is not None:
259	self.context.pop()
260	self.context = None
261
262	def __del__(self):
263	self.release()
264
265	def has_type(self, dtype):
266	# type: (np.dtype) -> bool
267	"""
268	Return True if all devices support a given type.
269	"""
270	return has_type(dtype)
271
272	def compile_program(self, name, source, dtype, fast, timestamp):
273	# type: (str, str, np.dtype, bool, float) -> cl.Program
274	"""
275	Compile the program for the device in the given context.
276	"""
277	# Note: PyCuda (probably) caches but I'll do so as well just to
278	# save some data munging time.
279	tag = generate.tag_source(source)
280	key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
281	# Check timestamp on program.
282	program, program_timestamp = self.compiled.get(key, (None, np.inf))
283	if program_timestamp < timestamp:
284	del self.compiled[key]
285	if key not in self.compiled:
286	logging.info("building %s for CUDA", key)
287	program = compile_model(str(source), dtype, fast)
288	self.compiled[key] = (program, timestamp)
289	return program
290
291
292	class GpuModel(KernelModel):
293	"""
294	GPU wrapper for a single model.
295
296	source and model_info are the model source and interface as returned
297	from :func:`generate.make_source` and :func:`generate.make_model_info`.
298
299	dtype is the desired model precision. Any numpy dtype for single
300	or double precision floats will do, such as 'f', 'float32' or 'single'
301	for single and 'd', 'float64' or 'double' for double. Double precision
302	is an optional extension which may not be available on all devices.
303	Half precision ('float16','half') may be available on some devices.
304	Fast precision ('fast') is a loose version of single precision, indicating
305	that the compiler is allowed to take shortcuts.
306	"""
307	info = None # type: ModelInfo
308	source = "" # type: str
309	dtype = None # type: np.dtype
310	fast = False # type: bool
311	_program = None # type: SourceModule
312	_kernels = None # type: Dict[str, cuda.Function]
313
314	def __init__(self, source, model_info, dtype=generate.F32, fast=False):
315	# type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
316	self.info = model_info
317	self.source = source
318	self.dtype = dtype
319	self.fast = fast
320
321	def __getstate__(self):
322	# type: () -> Tuple[ModelInfo, str, np.dtype, bool]
323	return self.info, self.source, self.dtype, self.fast
324
325	def __setstate__(self, state):
326	# type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
327	self.info, self.source, self.dtype, self.fast = state
328	self._program = self._kernels = None
329
330	def make_kernel(self, q_vectors):
331	# type: (List[np.ndarray]) -> "GpuKernel"
332	return GpuKernel(self, q_vectors)
333
334	def get_function(self, name):
335	# type: (str) -> cuda.Function
336	"""
337	Fetch the kernel from the environment by name, compiling it if it
338	does not already exist.
339	"""
340	if self._program is None:
341	self._prepare_program()
342	return self._kernels[name]
343
344	def _prepare_program(self):
345	# type: (str) -> None
346	env = environment()
347	timestamp = generate.ocl_timestamp(self.info)
348	program = env.compile_program(
349	self.info.name,
350	self.source['opencl'],
351	self.dtype,
352	self.fast,
353	timestamp)
354	variants = ['Iq', 'Iqxy', 'Imagnetic']
355	names = [generate.kernel_name(self.info, k) for k in variants]
356	functions = [program.get_function(k) for k in names]
357	self._kernels = {k: v for k, v in zip(variants, functions)}
358	# Keep a handle to program so GC doesn't collect.
359	self._program = program
360
361
362	# TODO: Check that we don't need a destructor for buffers which go out of scope.
363	class GpuInput(object):
364	"""
365	Make q data available to the gpu.
366
367	q_vectors is a list of q vectors, which will be [q] for 1-D data,
368	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
369	to get the best performance on OpenCL, which may involve shifting and
370	stretching the array to better match the memory architecture. Additional
371	points will be evaluated with q=1e-3.
372
373	dtype is the data type for the q vectors. The data type should be
374	set to match that of the kernel, which is an attribute of
375	:class:`GpuProgram`. Note that not all kernels support double
376	precision, so even if the program was created for double precision,
377	the GpuProgram.dtype may be single precision.
378
379	Call :meth:`release` when complete. Even if not called directly, the
380	buffer will be released when the data object is freed.
381	"""
382	def __init__(self, q_vectors, dtype=generate.F32):
383	# type: (List[np.ndarray], np.dtype) -> None
384	# TODO: Do we ever need double precision q?
385	self.nq = q_vectors[0].size
386	self.dtype = np.dtype(dtype)
387	self.is_2d = (len(q_vectors) == 2)
388	# TODO: Stretch input based on get_warp().
389	# Not doing it now since warp depends on kernel, which is not known
390	# at this point, so instead using 32, which is good on the set of
391	# architectures tested so far.
392	if self.is_2d:
393	width = ((self.nq+15)//16)*16
394	self.q = np.empty((width, 2), dtype=dtype)
395	self.q[:self.nq, 0] = q_vectors[0]
396	self.q[:self.nq, 1] = q_vectors[1]
397	else:
398	width = ((self.nq+31)//32)*32
399	self.q = np.empty(width, dtype=dtype)
400	self.q[:self.nq] = q_vectors[0]
401	self.global_size = [self.q.shape[0]]
402	#print("creating inputs of size", self.global_size)
403
404	# Transfer input value to GPU.
405	self.q_b = cuda.to_device(self.q)
406
407	def release(self):
408	# type: () -> None
409	"""
410	Free the buffer associated with the q value.
411	"""
412	if self.q_b is not None:
413	self.q_b.free()
414	self.q_b = None
415
416	def __del__(self):
417	# type: () -> None
418	self.release()
419
420
421	class GpuKernel(Kernel):
422	"""
423	Callable SAS kernel.
424
425	model is the GpuModel object to call
426
427	The kernel is derived from :class:`Kernel`, providing the
428	:meth:`call_kernel` method to evaluate the kernel for a given set of
429	parameters. Because of the need to move the q values to the GPU before
430	evaluation, the kernel is instantiated for a particular set of q vectors,
431	and can be called many times without transfering q each time.
432
433	Call :meth:`release` when done with the kernel instance.
434	"""
435	#: SAS model information structure.
436	info = None # type: ModelInfo
437	#: Kernel precision.
438	dtype = None # type: np.dtype
439	#: Kernel dimensions (1d or 2d).
440	dim = "" # type: str
441	#: Calculation results, updated after each call to :meth:`_call_kernel`.
442	result = None # type: np.ndarray
443
444	def __init__(self, model, q_vectors):
445	# type: (GpuModel, List[np.ndarray]) -> None
446	dtype = model.dtype
447	self.q_input = GpuInput(q_vectors, dtype)
448	self._model = model
449
450	# Attributes accessed from the outside.
451	self.dim = '2d' if self.q_input.is_2d else '1d'
452	self.info = model.info
453	self.dtype = dtype
454
455	# Converter to translate input to target type.
456	self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
457
458	# Holding place for the returned value.
459	nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
460	extra_q = 4 # Total weight, form volume, shell volume and R_eff.
461	self.result = np.empty(self.q_input.nq*nout + extra_q, dtype)
462
463	# Allocate result value on GPU.
464	width = ((self.result.size+31)//32)32 self.dtype.itemsize
465	self._result_b = cuda.mem_alloc(width)
466
467	def _call_kernel(self, call_details, values, cutoff, magnetic,
468	effective_radius_type):
469	# type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray
470
471	# Arrange data transfer to card.
472	details_b = cuda.to_device(call_details.buffer)
473	values_b = cuda.to_device(values)
474
475	# Setup kernel function and arguments.
476	name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
477	kernel = self._model.get_function(name)
478	kernel_args = [
479	np.uint32(self.q_input.nq), # Number of inputs.
480	None, # Placeholder for pd_start.
481	None, # Placeholder for pd_stop.
482	details_b, # Problem definition.
483	values_b, # Parameter values.
484	self.q_input.q_b, # Q values.
485	self._result_b, # Result storage.
486	self._as_dtype(cutoff), # Probability cutoff.
487	np.uint32(effective_radius_type), # R_eff mode.
488	]
489	grid = partition(self.q_input.nq)
490
491	# Call kernel and retrieve results.
492	#print("Calling CUDA")
493	#call_details.show(values)
494	last_nap = time.clock()
495	step = 100000000//self.q_input.nq + 1
496	#step = 1000000000
497	for start in range(0, call_details.num_eval, step):
498	stop = min(start + step, call_details.num_eval)
499	#print("queuing",start,stop)
500	kernel_args[1:3] = [np.int32(start), np.int32(stop)]
501	kernel(kernel_args, *grid)
502	if stop < call_details.num_eval:
503	sync()
504	# Allow other processes to run.
505	current_time = time.clock()
506	if current_time - last_nap > 0.5:
507	time.sleep(0.001)
508	last_nap = current_time
509	sync()
510	cuda.memcpy_dtoh(self.result, self._result_b)
511	#print("result", self.result)
512
513	details_b.free()
514	values_b.free()
515
516	def release(self):
517	# type: () -> None
518	"""
519	Release resources associated with the kernel.
520	"""
521	self.q_input.release()
522	if self._result_b is not None:
523	self._result_b.free()
524	self._result_b = None
525
526	def __del__(self):
527	# type: () -> None
528	self.release()
529
530
531	def sync():
532	"""
533	Overview:
534	Waits for operation in the current context to complete.
535
536	Note: Maybe context.synchronize() is sufficient.
537	"""
538	# Create an event with which to synchronize.
539	done = cuda.Event()
540
541	# Schedule an event trigger on the GPU.
542	done.record()
543
544	# Make sure we don't hog resource while waiting to sync.
545	while not done.query():
546	time.sleep(0.01)
547
548	# Block until the GPU executes the kernel.
549	done.synchronize()
550
551	# Clean up the event; I don't think they can be reused.
552	del done
553
554
555	def partition(n):
556	'''
557	Overview:
558	Auto grids the thread blocks to achieve some level of calculation
559	efficiency.
560	'''
561	max_gx, max_gy = 65535, 65535
562	blocksize = 32
563	#max_gx, max_gy = 5, 65536
564	#blocksize = 3
565	block = (blocksize, 1, 1)
566	num_blocks = int((n+blocksize-1)/blocksize)
567	if num_blocks < max_gx:
568	grid = (num_blocks, 1)
569	else:
570	gx = max_gx
571	gy = (num_blocks + max_gx - 1) / max_gx
572	if gy >= max_gy:
573	raise ValueError("vector is too large")
574	grid = (gx, gy)
575	#print("block", block, "grid", grid)
576	#print("waste", block[0]block[1]block[2]grid[0]grid[1] - n)
577	return dict(block=block, grid=grid)

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernelcuda.py @ 3199b17

Download in other formats: