kernelcuda.py @ f872fd1

core_shell_microgelsmagnetic_modelticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since f872fd1 was f872fd1, checked in by Paul Kienzle <pkienzle@…>, 6 years ago
simplify F32/F64 handling
Property mode set to `100644`
File size: 20.0 KB

Line
1	"""
2	GPU driver for C kernels (with CUDA)
3
4	To select cuda, use SAS_OPENCL=cuda, or SAS_OPENCL=cuda:n for a particular
5	device number. If no device number is specified, then look for CUDA_DEVICE=n
6	or a file ~/.cuda-device containing n for the device number. Otherwise, try
7	all available device numbers.
8
9	TODO: docs are out of date
10
11	There should be a single GPU environment running on the system. This
12	environment is constructed on the first call to :func:`env`, and the
13	same environment is returned on each call.
14
15	After retrieving the environment, the next step is to create the kernel.
16	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
17	returns the type of data used by the kernel.
18
19	Next a :class:`GpuData` object should be created with the correct kind
20	of data. This data object can be used by multiple kernels, for example,
21	if the target model is a weighted sum of multiple kernels. The data
22	should include any extra evaluation points required to compute the proper
23	data smearing. This need not match the square grid for 2D data if there
24	is an index saying which q points are active.
25
26	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
27	This kernel is used during fitting, receiving new sets of parameters and
28	evaluating them. The output value is stored in an output buffer on the
29	devices, where it can be combined with other structure factors and form
30	factors and have instrumental resolution effects applied.
31
32	In order to use OpenCL for your models, you will need OpenCL drivers for
33	your machine. These should be available from your graphics card vendor.
34	Intel provides OpenCL drivers for CPUs as well as their integrated HD
35	graphics chipsets. AMD also provides drivers for Intel CPUs, but as of
36	this writing the performance is lacking compared to the Intel drivers.
37	NVidia combines drivers for CUDA and OpenCL in one package. The result
38	is a bit messy if you have multiple drivers installed. You can see which
39	drivers are available by starting python and running:
40
41	import pyopencl as cl
42	cl.create_some_context(interactive=True)
43
44	Once you have done that, it will show the available drivers which you
45	can select. It will then tell you that you can use these drivers
46	automatically by setting the SAS_OPENCL environment variable, which is
47	PYOPENCL_CTX equivalent but not conflicting with other pyopnecl programs.
48
49	Some graphics cards have multiple devices on the same card. You cannot
50	yet use both of them concurrently to evaluate models, but you can run
51	the program twice using a different device for each session.
52
53	OpenCL kernels are compiled when needed by the device driver. Some
54	drivers produce compiler output even when there is no error. You
55	can see the output by setting PYOPENCL_COMPILER_OUTPUT=1. It should be
56	harmless, albeit annoying.
57	"""
58	from __future__ import print_function
59
60	import os
61	import warnings
62	import logging
63	import time
64	import re
65
66	import numpy as np # type: ignore
67
68
69	# Attempt to setup cuda. This may fail if the pycuda package is not
70	# installed or if it is installed but there are no devices available.
71	try:
72	import pycuda.driver as cuda # type: ignore
73	from pycuda.compiler import SourceModule
74	from pycuda.tools import make_default_context, clear_context_caches
75	# Ask CUDA for the default context (so that we know that one exists)
76	# then immediately throw it away in case the user doesn't want it.
77	# Note: cribbed from pycuda.autoinit
78	cuda.init()
79	context = make_default_context()
80	context.pop()
81	clear_context_caches()
82	del context
83	HAVE_CUDA = True
84	CUDA_ERROR = ""
85	except Exception as exc:
86	HAVE_CUDA = False
87	CUDA_ERROR = str(exc)
88
89	from . import generate
90	from .kernel import KernelModel, Kernel
91
92	# pylint: disable=unused-import
93	try:
94	from typing import Tuple, Callable, Any
95	from .modelinfo import ModelInfo
96	from .details import CallDetails
97	except ImportError:
98	pass
99	# pylint: enable=unused-import
100
101	# The max loops number is limited by the amount of local memory available
102	# on the device. You don't want to make this value too big because it will
103	# waste resources, nor too small because it may interfere with users trying
104	# to do their polydispersity calculations. A value of 1024 should be much
105	# larger than necessary given that cost grows as npts^k where k is the number
106	# of polydisperse parameters.
107	MAX_LOOPS = 2048
108
109	def use_cuda():
110	env = os.environ.get("SAS_OPENCL", "").lower()
111	return HAVE_CUDA and (env == "" or env.startswith("cuda"))
112
113	ENV = None
114	def reset_environment():
115	"""
116	Call to create a new OpenCL context, such as after a change to SAS_OPENCL.
117	"""
118	global ENV
119	# Free any previous allocated context.
120	if ENV is not None and ENV.context is not None:
121	ENV.release()
122	ENV = GpuEnvironment() if use_cuda() else None
123
124	def environment():
125	# type: () -> "GpuEnvironment"
126	"""
127	Returns a singleton :class:`GpuEnvironment`.
128
129	This provides an OpenCL context and one queue per device.
130	"""
131	if ENV is None:
132	if not HAVE_CUDA:
133	raise RuntimeError("CUDA startup failed with ***"
134	+ CUDA_ERROR + "***; using C compiler instead")
135	reset_environment()
136	if ENV is None:
137	raise RuntimeError("SAS_OPENCL=None in environment")
138	return ENV
139
140	def has_type(dtype):
141	# type: (np.dtype) -> bool
142	"""
143	Return true if device supports the requested precision.
144	"""
145	# Assume the nvidia card supports 32-bit and 64-bit floats.
146	# TODO: check if pycuda support F16
147	return dtype in (generate.F32, generate.F64)
148
149
150	FUNCTION_PATTERN = re.compile(r"""^
151	(?P<space>\s*) # initial space
152	(?P<qualifiers>^(?:\s\b\w+\b\s)+) # one or more qualifiers before function
153	(?P<function>\s\b\w+\b\s[(]) # function name plus open parens
154	""", re.VERBOSE\|re.MULTILINE)
155
156	MARKED_PATTERN = re.compile(r"""
157	\b(return\|else\|kernel\|device\|__device__)\b
158	""", re.VERBOSE\|re.MULTILINE)
159
160	def _add_device_tag(match):
161	# type: (None) -> str
162	# Note: should be re.Match, but that isn't a simple type
163	"""
164	replace qualifiers with __device__ qualifiers if needed
165	"""
166	qualifiers = match.group("qualifiers")
167	if MARKED_PATTERN.search(qualifiers):
168	start, end = match.span()
169	return match.string[start:end]
170	else:
171	function = match.group("function")
172	space = match.group("space")
173	return "".join((space, "__device__ ", qualifiers, function))
174
175	def mark_device_functions(source):
176	# type: (str) -> str
177	"""
178	Mark all function declarations as __device__ functions (except kernel).
179	"""
180	return FUNCTION_PATTERN.sub(_add_device_tag, source)
181
182	def show_device_functions(source):
183	# type: (str) -> str
184	"""
185	Show all discovered function declarations, but don't change any.
186	"""
187	for match in FUNCTION_PATTERN.finditer(source):
188	print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(')
189	return source
190
191	def compile_model(source, dtype, fast=False):
192	# type: (str, np.dtype, bool) -> SourceModule
193	"""
194	Build a model to run on the gpu.
195
196	Returns the compiled program and its type. The returned type will
197	be float32 even if the desired type is float64 if any of the
198	devices in the context do not support the cl_khr_fp64 extension.
199	"""
200	dtype = np.dtype(dtype)
201	if not has_type(dtype):
202	raise RuntimeError("%s not supported for devices"%dtype)
203
204	source_list = [generate.convert_type(source, dtype)]
205
206	source_list.insert(0, "#define USE_SINCOS\n")
207	source = "\n".join(source_list)
208	#source = show_device_functions(source)
209	source = mark_device_functions(source)
210	#with open('/tmp/kernel.cu', 'w') as fd: fd.write(source)
211	#print(source)
212	#options = ['--verbose', '-E']
213	options = ['--use_fast_math'] if fast else None
214	program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...]
215
216	#print("done with "+program)
217	return program
218
219
220	# for now, this returns one device in the context
221	# TODO: create a context that contains all devices on all platforms
222	class GpuEnvironment(object):
223	"""
224	GPU context, with possibly many devices, and one queue per device.
225	"""
226	context = None # type: cuda.Context
227	def __init__(self, devnum=None):
228	# type: (int) -> None
229	# Byte boundary for data alignment
230	#self.data_boundary = max(d.min_data_type_align_size
231	# for d in self.context.devices)
232	self.compiled = {}
233	env = os.environ.get("SAS_OPENCL", "").lower()
234	if devnum is None and env.startswith("cuda:"):
235	devnum = int(env[5:])
236	# Set the global context to the particular device number if one is
237	# given, otherwise use the default context. Perhaps this will be set
238	# by an environment variable within autoinit.
239	if devnum is not None:
240	self.context = cuda.Device(devnum).make_context()
241	else:
242	self.context = make_default_context()
243
244	def release(self):
245	if self.context is not None:
246	self.context.pop()
247	self.context = None
248
249	def __del__(self):
250	self.release()
251
252	def has_type(self, dtype):
253	# type: (np.dtype) -> bool
254	"""
255	Return True if all devices support a given type.
256	"""
257	return has_type(dtype)
258
259	def compile_program(self, name, source, dtype, fast, timestamp):
260	# type: (str, str, np.dtype, bool, float) -> cl.Program
261	"""
262	Compile the program for the device in the given context.
263	"""
264	# Note: PyOpenCL caches based on md5 hash of source, options and device
265	# so we don't really need to cache things for ourselves. I'll do so
266	# anyway just to save some data munging time.
267	tag = generate.tag_source(source)
268	key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
269	# Check timestamp on program
270	program, program_timestamp = self.compiled.get(key, (None, np.inf))
271	if program_timestamp < timestamp:
272	del self.compiled[key]
273	if key not in self.compiled:
274	logging.info("building %s for CUDA", key)
275	program = compile_model(str(source), dtype, fast)
276	self.compiled[key] = (program, timestamp)
277	return program
278
279	class GpuModel(KernelModel):
280	"""
281	GPU wrapper for a single model.
282
283	source and model_info are the model source and interface as returned
284	from :func:`generate.make_source` and :func:`generate.make_model_info`.
285
286	dtype is the desired model precision. Any numpy dtype for single
287	or double precision floats will do, such as 'f', 'float32' or 'single'
288	for single and 'd', 'float64' or 'double' for double. Double precision
289	is an optional extension which may not be available on all devices.
290	Half precision ('float16','half') may be available on some devices.
291	Fast precision ('fast') is a loose version of single precision, indicating
292	that the compiler is allowed to take shortcuts.
293	"""
294	info = None # type: ModelInfo
295	source = "" # type: str
296	dtype = None # type: np.dtype
297	fast = False # type: bool
298	program = None # type: SourceModule
299	_kernels = None # type: List[cuda.Function]
300
301	def __init__(self, source, model_info, dtype=generate.F32, fast=False):
302	# type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
303	self.info = model_info
304	self.source = source
305	self.dtype = dtype
306	self.fast = fast
307	self.program = None # delay program creation
308	self._kernels = None
309
310	def __getstate__(self):
311	# type: () -> Tuple[ModelInfo, str, np.dtype, bool]
312	return self.info, self.source, self.dtype, self.fast
313
314	def __setstate__(self, state):
315	# type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
316	self.info, self.source, self.dtype, self.fast = state
317	self.program = None
318
319	def make_kernel(self, q_vectors):
320	# type: (List[np.ndarray]) -> "GpuKernel"
321	if self.program is None:
322	compile_program = environment().compile_program
323	timestamp = generate.ocl_timestamp(self.info)
324	self.program = compile_program(
325	self.info.name,
326	self.source['opencl'],
327	self.dtype,
328	self.fast,
329	timestamp)
330	variants = ['Iq', 'Iqxy', 'Imagnetic']
331	names = [generate.kernel_name(self.info, k) for k in variants]
332	kernels = [self.program.get_function(k) for k in names]
333	self._kernels = dict((k, v) for k, v in zip(variants, kernels))
334	is_2d = len(q_vectors) == 2
335	if is_2d:
336	kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']]
337	else:
338	kernel = [self._kernels['Iq']]*2
339	return GpuKernel(kernel, self.dtype, self.info, q_vectors)
340
341	def release(self):
342	# type: () -> None
343	"""
344	Free the resources associated with the model.
345	"""
346	if self.program is not None:
347	self.program = None
348
349	def __del__(self):
350	# type: () -> None
351	self.release()
352
353	# TODO: check that we don't need a destructor for buffers which go out of scope
354	class GpuInput(object):
355	"""
356	Make q data available to the gpu.
357
358	q_vectors is a list of q vectors, which will be [q] for 1-D data,
359	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
360	to get the best performance on OpenCL, which may involve shifting and
361	stretching the array to better match the memory architecture. Additional
362	points will be evaluated with q=1e-3.
363
364	dtype is the data type for the q vectors. The data type should be
365	set to match that of the kernel, which is an attribute of
366	:class:`GpuProgram`. Note that not all kernels support double
367	precision, so even if the program was created for double precision,
368	the GpuProgram.dtype may be single precision.
369
370	Call :meth:`release` when complete. Even if not called directly, the
371	buffer will be released when the data object is freed.
372	"""
373	def __init__(self, q_vectors, dtype=generate.F32):
374	# type: (List[np.ndarray], np.dtype) -> None
375	# TODO: do we ever need double precision q?
376	self.nq = q_vectors[0].size
377	self.dtype = np.dtype(dtype)
378	self.is_2d = (len(q_vectors) == 2)
379	# TODO: stretch input based on get_warp()
380	# not doing it now since warp depends on kernel, which is not known
381	# at this point, so instead using 32, which is good on the set of
382	# architectures tested so far.
383	if self.is_2d:
384	# Note: 16 rather than 15 because result is 1 longer than input.
385	width = ((self.nq+16)//16)*16
386	self.q = np.empty((width, 2), dtype=dtype)
387	self.q[:self.nq, 0] = q_vectors[0]
388	self.q[:self.nq, 1] = q_vectors[1]
389	else:
390	# Note: 32 rather than 31 because result is 1 longer than input.
391	width = ((self.nq+32)//32)*32
392	self.q = np.empty(width, dtype=dtype)
393	self.q[:self.nq] = q_vectors[0]
394	self.global_size = [self.q.shape[0]]
395	#print("creating inputs of size", self.global_size)
396	self.q_b = cuda.to_device(self.q)
397
398	def release(self):
399	# type: () -> None
400	"""
401	Free the memory.
402	"""
403	if self.q_b is not None:
404	self.q_b.free()
405	self.q_b = None
406
407	def __del__(self):
408	# type: () -> None
409	self.release()
410
411	class GpuKernel(Kernel):
412	"""
413	Callable SAS kernel.
414
415	kernel is the GpuKernel object to call
416
417	model_info is the module information
418
419	q_vectors is the q vectors at which the kernel should be evaluated
420
421	dtype is the kernel precision
422
423	The resulting call method takes the pars, a list of values for
424	the fixed parameters to the kernel, and pd_pars, a list of (value,weight)
425	vectors for the polydisperse parameters. cutoff determines the
426	integration limits: any points with combined weight less than cutoff
427	will not be calculated.
428
429	Call :meth:`release` when done with the kernel instance.
430	"""
431	def __init__(self, kernel, dtype, model_info, q_vectors):
432	# type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None
433	self.q_input = GpuInput(q_vectors, dtype)
434	self.kernel = kernel
435	# F16 isn't sufficient, so don't support it
436	self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
437
438	# attributes accessed from the outside
439	self.dim = '2d' if self.q_input.is_2d else '1d'
440	self.info = model_info
441	self.dtype = dtype
442
443	# holding place for the returned value
444	nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
445	extra_q = 4 # total weight, form volume, shell volume and R_eff
446	self.result = np.empty(self.q_input.nq*nout+extra_q, dtype)
447
448	# Inputs and outputs for each kernel call
449	# Note: res may be shorter than res_b if global_size != nq
450	width = ((self.result.size+31)//32)32 self.dtype.itemsize
451	self.result_b = cuda.mem_alloc(width)
452	self._need_release = [self.result_b]
453
454	def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type):
455	# type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray
456	# Arrange data transfer to card
457	details_b = cuda.to_device(call_details.buffer)
458	values_b = cuda.to_device(values)
459
460	kernel = self.kernel[1 if magnetic else 0]
461	args = [
462	np.uint32(self.q_input.nq), None, None,
463	details_b, values_b, self.q_input.q_b, self.result_b,
464	self._as_dtype(cutoff),
465	np.uint32(effective_radius_type),
466	]
467	grid = partition(self.q_input.nq)
468	#print("Calling OpenCL")
469	#call_details.show(values)
470	# Call kernel and retrieve results
471	last_nap = time.clock()
472	step = 100000000//self.q_input.nq + 1
473	#step = 1000000000
474	for start in range(0, call_details.num_eval, step):
475	stop = min(start + step, call_details.num_eval)
476	#print("queuing",start,stop)
477	args[1:3] = [np.int32(start), np.int32(stop)]
478	kernel(args, *grid)
479	if stop < call_details.num_eval:
480	sync()
481	# Allow other processes to run
482	current_time = time.clock()
483	if current_time - last_nap > 0.5:
484	time.sleep(0.001)
485	last_nap = current_time
486	sync()
487	cuda.memcpy_dtoh(self.result, self.result_b)
488	#print("result", self.result)
489
490	details_b.free()
491	values_b.free()
492
493	def release(self):
494	# type: () -> None
495	"""
496	Release resources associated with the kernel.
497	"""
498	for p in self._need_release:
499	p.free()
500	self._need_release = []
501
502	def __del__(self):
503	# type: () -> None
504	self.release()
505
506
507	def sync():
508	"""
509	Overview:
510	Waits for operation in the current context to complete.
511
512	Note: Maybe context.synchronize() is sufficient.
513	"""
514	#return # The following works in C++; don't know what pycuda is doing
515	# Create an event with which to synchronize
516	done = cuda.Event()
517
518	# Schedule an event trigger on the GPU.
519	done.record()
520
521	#line added to not hog resources
522	while not done.query():
523	time.sleep(0.01)
524
525	# Block until the GPU executes the kernel.
526	done.synchronize()
527	# Clean up the event; I don't think they can be reused.
528	del done
529
530
531	def partition(n):
532	'''
533	Overview:
534	Auto grids the thread blocks to achieve some level of calculation
535	efficiency.
536	'''
537	max_gx, max_gy = 65535, 65535
538	blocksize = 32
539	#max_gx, max_gy = 5, 65536
540	#blocksize = 3
541	block = (blocksize, 1, 1)
542	num_blocks = int((n+blocksize-1)/blocksize)
543	if num_blocks < max_gx:
544	grid = (num_blocks, 1)
545	else:
546	gx = max_gx
547	gy = (num_blocks + max_gx - 1) / max_gx
548	if gy >= max_gy:
549	raise ValueError("vector is too large")
550	grid = (gx, gy)
551	#print("block", block, "grid", grid)
552	#print("waste", block[0]block[1]block[2]grid[0]grid[1] - n)
553	return dict(block=block, grid=grid)

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernelcuda.py @ f872fd1

Download in other formats: