kernelcl.py @ 250fa25

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since 250fa25 was 250fa25, checked in by pkienzle, 9 years ago
autoselect gpu for opencl if available
Property mode set to `100644`
File size: 13.9 KB

Rev	Line
[14de349]	1	"""
	2	GPU support through OpenCL
	3
	4	There should be a single GPU environment running on the system. This
	5	environment is constructed on the first call to :func:`env`, and the
	6	same environment is returned on each call.
	7
	8	After retrieving the environment, the next step is to create the kernel.
	9	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
	10	returns the type of data used by the kernel.
	11
	12	Next a :class:`GpuData` object should be created with the correct kind
	13	of data. This data object can be used by multiple kernels, for example,
	14	if the target model is a weighted sum of multiple kernels. The data
	15	should include any extra evaluation points required to compute the proper
	16	data smearing. This need not match the square grid for 2D data if there
	17	is an index saying which q points are active.
	18
	19	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
	20	This kernel is used during fitting, receiving new sets of parameters and
	21	evaluating them. The output value is stored in an output buffer on the
	22	devices, where it can be combined with other structure factors and form
	23	factors and have instrumental resolution effects applied.
	24	"""
[250fa25]	25	import os
	26	import warnings
	27
[14de349]	28	import numpy as np
[b3f6bc3]	29
[250fa25]	30	try:
	31	import pyopencl as cl
	32	except ImportError,exc:
	33	warnings.warn(str(exc))
	34	raise RuntimeError("OpenCL not available")
	35
[14de349]	36	from pyopencl import mem_flags as mf
	37
[cb6ecf4]	38	from . import generate
[f786ff3]	39	from .kernelpy import PyInput, PyKernel
[14de349]	40
	41	F64_DEFS = """\
[d087487b]	42	#ifdef cl_khr_fp64
	43	# pragma OPENCL EXTENSION cl_khr_fp64: enable
	44	#endif
[14de349]	45	"""
	46
[ce27e21]	47	# The max loops number is limited by the amount of local memory available
	48	# on the device. You don't want to make this value too big because it will
	49	# waste resources, nor too small because it may interfere with users trying
	50	# to do their polydispersity calculations. A value of 1024 should be much
	51	# larger than necessary given that cost grows as npts^k where k is the number
	52	# of polydisperse parameters.
[5d4777d]	53	MAX_LOOPS = 2048
	54
	55	def load_model(kernel_module, dtype="single"):
	56	"""
	57	Load the OpenCL model defined by kernel_module.
	58
	59	Access to the OpenCL device is delayed until the kernel is called
	60	so models can be defined without using too many resources.
	61	"""
[cb6ecf4]	62	source, info = generate.make(kernel_module)
[5d4777d]	63	## for debugging, save source to a .cl file, edit it, and reload as model
[f4cf580]	64	#open(info['name']+'.cl','w').write(source)
[5d4777d]	65	#source = open(info['name']+'.cl','r').read()
	66	return GpuModel(source, info, dtype)
[ce27e21]	67
[14de349]	68	ENV = None
	69	def environment():
	70	"""
	71	Returns a singleton :class:`GpuEnvironment`.
	72
	73	This provides an OpenCL context and one queue per device.
	74	"""
	75	global ENV
	76	if ENV is None:
	77	ENV = GpuEnvironment()
	78	return ENV
	79
	80	def has_double(device):
	81	"""
	82	Return true if device supports double precision.
	83	"""
	84	return "cl_khr_fp64" in device.extensions
	85
[f5b9a6b]	86	def get_warp(kernel, queue):
	87	"""
	88	Return the size of an execution batch for kernel running on queue.
	89	"""
	90	return kernel.get_work_group_info(cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
	91	queue.device)
[14de349]	92
[f5b9a6b]	93	def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
[14de349]	94	"""
	95	Stretch an input vector to the correct boundary.
	96
	97	Performance on the kernels can drop by a factor of two or more if the
	98	number of values to compute does not fall on a nice power of two
[f5b9a6b]	99	boundary. The trailing additional vector elements are given a
	100	value of extra, and so f(extra) will be computed for each of
	101	them. The returned array will thus be a subset of the computed array.
	102
	103	boundary should be a power of 2 which is at least 32 for good
	104	performance on current platforms (as of Jan 2015). It should
	105	probably be the max of get_warp(kernel,queue) and
	106	device.min_data_type_align_size//4.
	107	"""
[14de349]	108	remainder = vector.size%boundary
[f5b9a6b]	109	if remainder != 0:
	110	size = vector.size + (boundary - remainder)
[14de349]	111	vector = np.hstack((vector, [extra]*(size-vector.size)))
	112	return np.ascontiguousarray(vector, dtype=dtype)
	113
	114
	115	def compile_model(context, source, dtype):
	116	"""
	117	Build a model to run on the gpu.
	118
	119	Returns the compiled program and its type. The returned type will
	120	be float32 even if the desired type is float64 if any of the
	121	devices in the context do not support the cl_khr_fp64 extension.
	122	"""
	123	dtype = np.dtype(dtype)
[cb6ecf4]	124	if dtype==generate.F64 and not all(has_double(d) for d in context.devices):
[ce27e21]	125	raise RuntimeError("Double precision not supported for devices")
[14de349]	126
[cb6ecf4]	127	header = F64_DEFS if dtype == generate.F64 else ""
	128	if dtype == generate.F32:
	129	source = generate.use_single(source)
[14de349]	130	# Note: USE_SINCOS makes the intel cpu slower under opencl
	131	if context.devices[0].type == cl.device_type.GPU:
	132	header += "#define USE_SINCOS\n"
	133	program = cl.Program(context, header+source).build()
[ce27e21]	134	return program
[14de349]	135
	136
	137	def make_result(self, size):
	138	self.res = np.empty(size, dtype=self.dtype)
	139	self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
	140	return self.res, self.res_b
	141
	142
	143	# for now, this returns one device in the context
	144	# TODO: create a context that contains all devices on all platforms
	145	class GpuEnvironment(object):
	146	"""
	147	GPU context, with possibly many devices, and one queue per device.
	148	"""
	149	def __init__(self):
[250fa25]	150	# find gpu context
	151	#self.context = cl.create_some_context()
	152
	153	self.context = None
	154	if 'PYOPENCL_CTX' in os.environ:
	155	self._create_some_context()
	156
	157	if not self.context:
	158	self.context = self._find_context()
	159
[f5b9a6b]	160	# Byte boundary for data alignment
	161	#self.data_boundary = max(d.min_data_type_align_size
	162	# for d in self.context.devices)
[250fa25]	163	self.queues = [cl.CommandQueue(self.context, d)
	164	for d in self.context.devices]
[ce27e21]	165	self.has_double = all(has_double(d) for d in self.context.devices)
	166	self.compiled = {}
	167
[250fa25]	168	def _create_some_context(self):
	169	try:
	170	self.context = cl.create_some_context(interactive=False)
	171	except Exception,exc:
	172	warnings.warn(str(exc))
	173	warnings.warn("pyopencl.create_some_context() failed")
	174	warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
	175
	176	def _find_context(self):
	177	default = None
	178	for platform in cl.get_platforms():
	179	for device in platform.get_devices():
	180	if device.type == cl.device_type.GPU:
	181	return cl.Context([device])
	182	if default is None:
	183	default = device
	184
	185	if not default:
	186	raise RuntimeError("OpenCL device not found")
	187
	188	return cl.Context([default])
	189
[ce27e21]	190	def compile_program(self, name, source, dtype):
	191	if name not in self.compiled:
	192	#print "compiling",name
	193	self.compiled[name] = compile_model(self.context, source, dtype)
	194	return self.compiled[name]
	195
	196	def release_program(self, name):
	197	if name in self.compiled:
	198	self.compiled[name].release()
	199	del self.compiled[name]
[14de349]	200
[250fa25]	201
[14de349]	202	class GpuModel(object):
	203	"""
	204	GPU wrapper for a single model.
	205
[ce27e21]	206	source and info are the model source and interface as returned
[14de349]	207	from :func:`gen.make`.
	208
	209	dtype is the desired model precision. Any numpy dtype for single
	210	or double precision floats will do, such as 'f', 'float32' or 'single'
	211	for single and 'd', 'float64' or 'double' for double. Double precision
	212	is an optional extension which may not be available on all devices.
	213	"""
[cb6ecf4]	214	def __init__(self, source, info, dtype=generate.F32):
[ce27e21]	215	self.info = info
	216	self.source = source
	217	self.dtype = dtype
	218	self.program = None # delay program creation
[14de349]	219
[ce27e21]	220	def __getstate__(self):
	221	state = self.__dict__.copy()
	222	state['program'] = None
	223	return state
[14de349]	224
[ce27e21]	225	def __setstate__(self, state):
	226	self.__dict__ = state.copy()
	227
	228	def __call__(self, input):
[b3f6bc3]	229	# Support pure python kernel call
	230	if input.is_2D and callable(self.info['Iqxy']):
	231	return PyKernel(self.info['Iqxy'], self.info, input)
	232	elif not input.is_2D and callable(self.info['Iq']):
	233	return PyKernel(self.info['Iq'], self.info, input)
	234
[14de349]	235	if self.dtype != input.dtype:
	236	raise TypeError("data and kernel have different types")
[ce27e21]	237	if self.program is None:
	238	self.program = environment().compile_program(self.info['name'],self.source, self.dtype)
[cb6ecf4]	239	kernel_name = generate.kernel_name(self.info, input.is_2D)
[14de349]	240	kernel = getattr(self.program, kernel_name)
[ce27e21]	241	return GpuKernel(kernel, self.info, input)
	242
	243	def release(self):
	244	if self.program is not None:
	245	environment().release_program(self.info['name'])
	246	self.program = None
[14de349]	247
	248	def make_input(self, q_vectors):
	249	"""
	250	Make q input vectors available to the model.
	251
[b3f6bc3]	252	Note that each model needs its own q vector even if the case of
	253	mixture models because some models may be OpenCL, some may be
	254	ctypes and some may be pure python.
[14de349]	255	"""
[b3f6bc3]	256	# Support pure python kernel call
	257	if len(q_vectors) == 1 and callable(self.info['Iq']):
	258	return PyInput(q_vectors, dtype=self.dtype)
	259	elif callable(self.info['Iqxy']):
	260	return PyInput(q_vectors, dtype=self.dtype)
	261	else:
	262	return GpuInput(q_vectors, dtype=self.dtype)
[14de349]	263
	264	# TODO: check that we don't need a destructor for buffers which go out of scope
	265	class GpuInput(object):
	266	"""
	267	Make q data available to the gpu.
	268
	269	q_vectors is a list of q vectors, which will be [q] for 1-D data,
	270	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
	271	to get the best performance on OpenCL, which may involve shifting and
	272	stretching the array to better match the memory architecture. Additional
	273	points will be evaluated with q=1e-3.
	274
	275	dtype is the data type for the q vectors. The data type should be
	276	set to match that of the kernel, which is an attribute of
	277	:class:`GpuProgram`. Note that not all kernels support double
	278	precision, so even if the program was created for double precision,
	279	the GpuProgram.dtype may be single precision.
	280
	281	Call :meth:`release` when complete. Even if not called directly, the
	282	buffer will be released when the data object is freed.
	283	"""
[cb6ecf4]	284	def __init__(self, q_vectors, dtype=generate.F32):
[14de349]	285	env = environment()
	286	self.nq = q_vectors[0].size
	287	self.dtype = np.dtype(dtype)
	288	self.is_2D = (len(q_vectors) == 2)
[f5b9a6b]	289	# TODO: stretch input based on get_warp()
	290	# not doing it now since warp depends on kernel, which is not known
	291	# at this point, so instead using 32, which is good on the set of
	292	# architectures tested so far.
	293	self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
[14de349]	294	self.q_buffers = [
	295	cl.Buffer(env.context, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=q)
	296	for q in self.q_vectors
	297	]
	298	self.global_size = [self.q_vectors[0].size]
	299
	300	def release(self):
	301	for b in self.q_buffers:
	302	b.release()
	303	self.q_buffers = []
	304
	305	class GpuKernel(object):
[ff7119b]	306	"""
	307	Callable SAS kernel.
	308
	309	kernel is the GpuKernel object to call.
	310
	311	info is the module information
	312
	313	input is the DllInput q vectors at which the kernel should be
	314	evaluated.
	315
	316	The resulting call method takes the pars, a list of values for
	317	the fixed parameters to the kernel, and pd_pars, a list of (value,weight)
	318	vectors for the polydisperse parameters. cutoff determines the
	319	integration limits: any points with combined weight less than cutoff
	320	will not be calculated.
	321
	322	Call :meth:`release` when done with the kernel instance.
	323	"""
[ce27e21]	324	def __init__(self, kernel, info, input):
[14de349]	325	self.input = input
	326	self.kernel = kernel
[ce27e21]	327	self.info = info
	328	self.res = np.empty(input.nq, input.dtype)
	329	dim = '2d' if input.is_2D else '1d'
	330	self.fixed_pars = info['partype']['fixed-'+dim]
	331	self.pd_pars = info['partype']['pd-'+dim]
[14de349]	332
	333	# Inputs and outputs for each kernel call
[ce27e21]	334	# Note: res may be shorter than res_b if global_size != nq
	335	env = environment()
[14de349]	336	self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
[5d4777d]	337	2MAX_LOOPSinput.dtype.itemsize)
[14de349]	338	for _ in env.queues]
	339	self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
	340	input.global_size[0]*input.dtype.itemsize)
	341	for _ in env.queues]
	342
	343
[ce27e21]	344	def __call__(self, pars, pd_pars, cutoff=1e-5):
[cb6ecf4]	345	real = np.float32 if self.input.dtype == generate.F32 else np.float64
[ce27e21]	346	fixed = [real(p) for p in pars]
	347	cutoff = real(cutoff)
	348	loops = np.hstack(pd_pars)
	349	loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten()
[5d4777d]	350	Nloops = [np.uint32(len(p[0])) for p in pd_pars]
	351	#print "loops",Nloops, loops
[ce27e21]	352
[1780d59]	353	#import sys; print >>sys.stderr,"opencl eval",pars
[ce27e21]	354	#print "opencl eval",pars
[5d4777d]	355	if len(loops) > 2*MAX_LOOPS:
[ce27e21]	356	raise ValueError("too many polydispersity points")
[14de349]	357	device_num = 0
	358	res_bi = self.res_b[device_num]
	359	queuei = environment().queues[device_num]
	360	loops_bi = self.loops_b[device_num]
[ce27e21]	361	loops_l = cl.LocalMemory(len(loops.data))
[14de349]	362	cl.enqueue_copy(queuei, loops_bi, loops)
	363	#ctx = environment().context
	364	#loops_bi = cl.Buffer(ctx, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=loops)
[5d4777d]	365	args = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + Nloops
[ce27e21]	366	self.kernel(queuei, self.input.global_size, None, *args)
[14de349]	367	cl.enqueue_copy(queuei, self.res, res_bi)
	368
	369	return self.res
	370
	371	def release(self):
	372	for b in self.loops_b:
	373	b.release()
	374	self.loops_b = []
	375	for b in self.res_b:
	376	b.release()
	377	self.res_b = []
	378
	379	def __del__(self):
	380	self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernelcl.py @ 250fa25

Download in other formats: