kernelcl.py @ d1fe925

gh-pages

Last change on this file since d1fe925 was d1fe925, checked in by ajj, 9 years ago
Creating gh_pages branch for docs
Property mode set to `100644`
File size: 13.3 KB

Rev	Line
[d1fe925]	1	"""
	2	GPU support through OpenCL
	3
	4	There should be a single GPU environment running on the system. This
	5	environment is constructed on the first call to :func:`env`, and the
	6	same environment is returned on each call.
	7
	8	After retrieving the environment, the next step is to create the kernel.
	9	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
	10	returns the type of data used by the kernel.
	11
	12	Next a :class:`GpuData` object should be created with the correct kind
	13	of data. This data object can be used by multiple kernels, for example,
	14	if the target model is a weighted sum of multiple kernels. The data
	15	should include any extra evaluation points required to compute the proper
	16	data smearing. This need not match the square grid for 2D data if there
	17	is an index saying which q points are active.
	18
	19	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
	20	This kernel is used during fitting, receiving new sets of parameters and
	21	evaluating them. The output value is stored in an output buffer on the
	22	devices, where it can be combined with other structure factors and form
	23	factors and have instrumental resolution effects applied.
	24	"""
	25	import os
	26	import warnings
	27
	28	import numpy as np
	29
	30	try:
	31	import pyopencl as cl
	32	# Ask OpenCL for the default context so that we know that one exists
	33	cl.create_some_context(interactive=False)
	34	except Exception, exc:
	35	warnings.warn(str(exc))
	36	raise RuntimeError("OpenCL not available")
	37
	38	from pyopencl import mem_flags as mf
	39
	40	from . import generate
	41
	42	F64_DEFS = """\
	43	#ifdef cl_khr_fp64
	44	# pragma OPENCL EXTENSION cl_khr_fp64: enable
	45	#endif
	46	"""
	47
	48	# The max loops number is limited by the amount of local memory available
	49	# on the device. You don't want to make this value too big because it will
	50	# waste resources, nor too small because it may interfere with users trying
	51	# to do their polydispersity calculations. A value of 1024 should be much
	52	# larger than necessary given that cost grows as npts^k where k is the number
	53	# of polydisperse parameters.
	54	MAX_LOOPS = 2048
	55
	56
	57	ENV = None
	58	def environment():
	59	"""
	60	Returns a singleton :class:`GpuEnvironment`.
	61
	62	This provides an OpenCL context and one queue per device.
	63	"""
	64	global ENV
	65	if ENV is None:
	66	ENV = GpuEnvironment()
	67	return ENV
	68
	69	def has_double(device):
	70	"""
	71	Return true if device supports double precision.
	72	"""
	73	return "cl_khr_fp64" in device.extensions
	74
	75	def get_warp(kernel, queue):
	76	"""
	77	Return the size of an execution batch for kernel running on queue.
	78	"""
	79	return kernel.get_work_group_info(
	80	cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
	81	queue.device)
	82
	83	def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
	84	"""
	85	Stretch an input vector to the correct boundary.
	86
	87	Performance on the kernels can drop by a factor of two or more if the
	88	number of values to compute does not fall on a nice power of two
	89	boundary. The trailing additional vector elements are given a
	90	value of extra, and so f(extra) will be computed for each of
	91	them. The returned array will thus be a subset of the computed array.
	92
	93	boundary should be a power of 2 which is at least 32 for good
	94	performance on current platforms (as of Jan 2015). It should
	95	probably be the max of get_warp(kernel,queue) and
	96	device.min_data_type_align_size//4.
	97	"""
	98	remainder = vector.size % boundary
	99	if remainder != 0:
	100	size = vector.size + (boundary - remainder)
	101	vector = np.hstack((vector, [extra] * (size - vector.size)))
	102	return np.ascontiguousarray(vector, dtype=dtype)
	103
	104
	105	def compile_model(context, source, dtype):
	106	"""
	107	Build a model to run on the gpu.
	108
	109	Returns the compiled program and its type. The returned type will
	110	be float32 even if the desired type is float64 if any of the
	111	devices in the context do not support the cl_khr_fp64 extension.
	112	"""
	113	dtype = np.dtype(dtype)
	114	if dtype == generate.F64 and not all(has_double(d) for d in context.devices):
	115	raise RuntimeError("Double precision not supported for devices")
	116
	117	header = F64_DEFS if dtype == generate.F64 else ""
	118	if dtype == generate.F32:
	119	source = generate.use_single(source)
	120	# Note: USE_SINCOS makes the intel cpu slower under opencl
	121	if context.devices[0].type == cl.device_type.GPU:
	122	header += "#define USE_SINCOS\n"
	123	program = cl.Program(context, header + source).build()
	124	return program
	125
	126
	127	def make_result(self, size):
	128	self.res = np.empty(size, dtype=self.dtype)
	129	self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
	130	return self.res, self.res_b
	131
	132
	133	# for now, this returns one device in the context
	134	# TODO: create a context that contains all devices on all platforms
	135	class GpuEnvironment(object):
	136	"""
	137	GPU context, with possibly many devices, and one queue per device.
	138	"""
	139	def __init__(self):
	140	# find gpu context
	141	#self.context = cl.create_some_context()
	142
	143	self.context = None
	144	if 'PYOPENCL_CTX' in os.environ:
	145	self._create_some_context()
	146
	147	if not self.context:
	148	self.context = _get_default_context()
	149
	150	# Byte boundary for data alignment
	151	#self.data_boundary = max(d.min_data_type_align_size
	152	# for d in self.context.devices)
	153	self.queues = [cl.CommandQueue(self.context, d)
	154	for d in self.context.devices]
	155	self.has_double = all(has_double(d) for d in self.context.devices)
	156	self.compiled = {}
	157
	158	def _create_some_context(self):
	159	try:
	160	self.context = cl.create_some_context(interactive=False)
	161	except Exception, exc:
	162	warnings.warn(str(exc))
	163	warnings.warn("pyopencl.create_some_context() failed")
	164	warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
	165
	166	def compile_program(self, name, source, dtype):
	167	if name not in self.compiled:
	168	#print "compiling",name
	169	self.compiled[name] = compile_model(self.context, source, dtype)
	170	return self.compiled[name]
	171
	172	def release_program(self, name):
	173	if name in self.compiled:
	174	self.compiled[name].release()
	175	del self.compiled[name]
	176
	177	def _get_default_context():
	178	default = None
	179	for platform in cl.get_platforms():
	180	for device in platform.get_devices():
	181	if device.type == cl.device_type.GPU:
	182	return cl.Context([device])
	183	if default is None:
	184	default = device
	185
	186	if not default:
	187	raise RuntimeError("OpenCL device not found")
	188
	189	return cl.Context([default])
	190
	191
	192	class GpuModel(object):
	193	"""
	194	GPU wrapper for a single model.
	195
	196	source and info are the model source and interface as returned
	197	from :func:`gen.make`.
	198
	199	dtype is the desired model precision. Any numpy dtype for single
	200	or double precision floats will do, such as 'f', 'float32' or 'single'
	201	for single and 'd', 'float64' or 'double' for double. Double precision
	202	is an optional extension which may not be available on all devices.
	203	"""
	204	def __init__(self, source, info, dtype=generate.F32):
	205	self.info = info
	206	self.source = source
	207	self.dtype = np.dtype(dtype)
	208	self.program = None # delay program creation
	209
	210	def __getstate__(self):
	211	state = self.__dict__.copy()
	212	state['program'] = None
	213	return state
	214
	215	def __setstate__(self, state):
	216	self.__dict__ = state.copy()
	217
	218	def __call__(self, q_input):
	219	if self.dtype != q_input.dtype:
	220	raise TypeError("data is %s kernel is %s" % (q_input.dtype, self.dtype))
	221	if self.program is None:
	222	compiler = environment().compile_program
	223	self.program = compiler(self.info['name'], self.source, self.dtype)
	224	kernel_name = generate.kernel_name(self.info, q_input.is_2D)
	225	kernel = getattr(self.program, kernel_name)
	226	return GpuKernel(kernel, self.info, q_input)
	227
	228	def release(self):
	229	if self.program is not None:
	230	environment().release_program(self.info['name'])
	231	self.program = None
	232
	233	def make_input(self, q_vectors):
	234	"""
	235	Make q input vectors available to the model.
	236
	237	Note that each model needs its own q vector even if the case of
	238	mixture models because some models may be OpenCL, some may be
	239	ctypes and some may be pure python.
	240	"""
	241	return GpuInput(q_vectors, dtype=self.dtype)
	242
	243	# TODO: check that we don't need a destructor for buffers which go out of scope
	244	class GpuInput(object):
	245	"""
	246	Make q data available to the gpu.
	247
	248	q_vectors is a list of q vectors, which will be [q] for 1-D data,
	249	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
	250	to get the best performance on OpenCL, which may involve shifting and
	251	stretching the array to better match the memory architecture. Additional
	252	points will be evaluated with q=1e-3.
	253
	254	dtype is the data type for the q vectors. The data type should be
	255	set to match that of the kernel, which is an attribute of
	256	:class:`GpuProgram`. Note that not all kernels support double
	257	precision, so even if the program was created for double precision,
	258	the GpuProgram.dtype may be single precision.
	259
	260	Call :meth:`release` when complete. Even if not called directly, the
	261	buffer will be released when the data object is freed.
	262	"""
	263	def __init__(self, q_vectors, dtype=generate.F32):
	264	env = environment()
	265	self.nq = q_vectors[0].size
	266	self.dtype = np.dtype(dtype)
	267	self.is_2D = (len(q_vectors) == 2)
	268	# TODO: stretch input based on get_warp()
	269	# not doing it now since warp depends on kernel, which is not known
	270	# at this point, so instead using 32, which is good on the set of
	271	# architectures tested so far.
	272	self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
	273	self.q_buffers = [
	274	cl.Buffer(env.context, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=q)
	275	for q in self.q_vectors
	276	]
	277	self.global_size = [self.q_vectors[0].size]
	278
	279	def release(self):
	280	for b in self.q_buffers:
	281	b.release()
	282	self.q_buffers = []
	283
	284	class GpuKernel(object):
	285	"""
	286	Callable SAS kernel.
	287
	288	kernel is the GpuKernel object to call.
	289
	290	info is the module information
	291
	292	q_input is the DllInput q vectors at which the kernel should be
	293	evaluated.
	294
	295	The resulting call method takes the pars, a list of values for
	296	the fixed parameters to the kernel, and pd_pars, a list of (value,weight)
	297	vectors for the polydisperse parameters. cutoff determines the
	298	integration limits: any points with combined weight less than cutoff
	299	will not be calculated.
	300
	301	Call :meth:`release` when done with the kernel instance.
	302	"""
	303	def __init__(self, kernel, info, q_input):
	304	self.q_input = q_input
	305	self.kernel = kernel
	306	self.info = info
	307	self.res = np.empty(q_input.nq, q_input.dtype)
	308	dim = '2d' if q_input.is_2D else '1d'
	309	self.fixed_pars = info['partype']['fixed-' + dim]
	310	self.pd_pars = info['partype']['pd-' + dim]
	311
	312	# Inputs and outputs for each kernel call
	313	# Note: res may be shorter than res_b if global_size != nq
	314	env = environment()
	315	self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
	316	2 * MAX_LOOPS * q_input.dtype.itemsize)
	317	for _ in env.queues]
	318	self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
	319	q_input.global_size[0] * q_input.dtype.itemsize)
	320	for _ in env.queues]
	321
	322
	323	def __call__(self, fixed_pars, pd_pars, cutoff=1e-5):
	324	real = np.float32 if self.q_input.dtype == generate.F32 else np.float64
	325
	326	device_num = 0
	327	queuei = environment().queues[device_num]
	328	res_bi = self.res_b[device_num]
	329	nq = np.uint32(self.q_input.nq)
	330	if pd_pars:
	331	cutoff = real(cutoff)
	332	loops_N = [np.uint32(len(p[0])) for p in pd_pars]
	333	loops = np.hstack(pd_pars) \
	334	if pd_pars else np.empty(0, dtype=self.q_input.dtype)
	335	loops = np.ascontiguousarray(loops.T, self.q_input.dtype).flatten()
	336	#print "loops",Nloops, loops
	337
	338	#import sys; print >>sys.stderr,"opencl eval",pars
	339	#print "opencl eval",pars
	340	if len(loops) > 2 * MAX_LOOPS:
	341	raise ValueError("too many polydispersity points")
	342
	343	loops_bi = self.loops_b[device_num]
	344	cl.enqueue_copy(queuei, loops_bi, loops)
	345	loops_l = cl.LocalMemory(len(loops.data))
	346	#ctx = environment().context
	347	#loops_bi = cl.Buffer(ctx, mf.READ_ONLY\|mf.COPY_HOST_PTR, hostbuf=loops)
	348	dispersed = [loops_bi, loops_l, cutoff] + loops_N
	349	else:
	350	dispersed = []
	351	fixed = [real(p) for p in fixed_pars]
	352	args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed
	353	self.kernel(queuei, self.q_input.global_size, None, *args)
	354	cl.enqueue_copy(queuei, self.res, res_bi)
	355
	356	return self.res
	357
	358	def release(self):
	359	for b in self.loops_b:
	360	b.release()
	361	self.loops_b = []
	362	for b in self.res_b:
	363	b.release()
	364	self.res_b = []
	365
	366	def __del__(self):
	367	self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernelcl.py @ d1fe925

Download in other formats: