kernelcl.py @ 676351f

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since 676351f was 676351f, checked in by ajj, 9 years ago
Merge branch 'master' of https://github.com/SasView/sasmodels
Property mode set to `100644`
File size: 14.1 KB

Rev	Line
[14de349]	1	"""
	2	GPU support through OpenCL
	3
	4	There should be a single GPU environment running on the system. This
	5	environment is constructed on the first call to :func:`environment`, and the
	6	same environment is returned on each call.
	7
	8	After retrieving the environment, the next step is to create the kernel.
	9	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
	10	returns the type of data used by the kernel.
	11
	12	Next a :class:`GpuInput` object should be created with the correct kind
	13	of data. This data object can be used by multiple kernels, for example,
	14	if the target model is a weighted sum of multiple kernels. The data
	15	should include any extra evaluation points required to compute the proper
	16	data smearing. This need not match the square grid for 2D data if there
	17	is an index saying which q points are active.
	18
	19	Together the GpuInput, the program, and a device form a :class:`GpuKernel`.
	20	This kernel is used during fitting, receiving new sets of parameters and
	21	evaluating them. The output value is stored in an output buffer on the
	22	devices, where it can be combined with other structure factors and form
	23	factors and have instrumental resolution effects applied.
	24	"""
[250fa25]	25	import os
	26	import warnings
	27
[14de349]	28	import numpy as np
[b3f6bc3]	29
# pyopencl is an optional dependency.  If it cannot be imported, warn with
# the underlying reason and raise RuntimeError so that importing this module
# fails with a single, predictable exception type.
try:
    import pyopencl as cl
except ImportError as exc:
    warnings.warn(str(exc))
    raise RuntimeError("OpenCL not available")

# Importing pyopencl is not enough: try to create (and immediately discard)
# a context so that a missing or broken OpenCL driver is reported at import
# time rather than on the first kernel call.
try:
    context = cl.create_some_context(interactive=False)
    del context
except cl.RuntimeError as exc:
    warnings.warn(str(exc))
    raise RuntimeError("OpenCl not available")
	42
[14de349]	43	from pyopencl import mem_flags as mf
	44
[cb6ecf4]	45	from . import generate
[f786ff3]	46	from .kernelpy import PyInput, PyKernel
[14de349]	47
	48	F64_DEFS = """\
[d087487b]	49	#ifdef cl_khr_fp64
	50	# pragma OPENCL EXTENSION cl_khr_fp64: enable
	51	#endif
[14de349]	52	"""
	53
# The max loops number is limited by the amount of local memory available
# on the device.  You don't want to make this value too big because it will
# waste resources, nor too small because it may interfere with users trying
# to do their polydispersity calculations.  The current value of 2048 should
# be much larger than necessary given that cost grows as npts^k where k is
# the number of polydisperse parameters.
MAX_LOOPS = 2048
	61
	62	def load_model(kernel_module, dtype="single"):
	63	"""
	64	Load the OpenCL model defined by kernel_module.
	65
	66	Access to the OpenCL device is delayed until the kernel is called
	67	so models can be defined without using too many resources.
	68	"""
[cb6ecf4]	69	source, info = generate.make(kernel_module)
[5d4777d]	70	## for debugging, save source to a .cl file, edit it, and reload as model
[f4cf580]	71	#open(info['name']+'.cl','w').write(source)
[5d4777d]	72	#source = open(info['name']+'.cl','r').read()
	73	return GpuModel(source, info, dtype)
[ce27e21]	74
# Process-wide GpuEnvironment, created lazily by environment().
ENV = None
def environment():
    """
    Return the shared :class:`GpuEnvironment`, creating it on first use.

    The instance is cached in the module-level ``ENV`` variable, so every
    later call returns the same object.
    """
    global ENV
    if ENV is not None:
        return ENV
    ENV = GpuEnvironment()
    return ENV
	86
	87	def has_double(device):
	88	"""
	89	Return true if device supports double precision.
	90	"""
	91	return "cl_khr_fp64" in device.extensions
	92
def get_warp(kernel, queue):
    """
    Return the preferred work group size multiple for *kernel* on the
    device that *queue* is attached to.
    """
    info_key = cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE
    return kernel.get_work_group_info(info_key, queue.device)
[14de349]	99
[f5b9a6b]	100	def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
[14de349]	101	"""
	102	Stretch an input vector to the correct boundary.
	103
	104	Performance on the kernels can drop by a factor of two or more if the
	105	number of values to compute does not fall on a nice power of two
[f5b9a6b]	106	boundary. The trailing additional vector elements are given a
	107	value of extra, and so f(extra) will be computed for each of
	108	them. The returned array will thus be a subset of the computed array.
	109
	110	boundary should be a power of 2 which is at least 32 for good
	111	performance on current platforms (as of Jan 2015). It should
	112	probably be the max of get_warp(kernel,queue) and
	113	device.min_data_type_align_size//4.
	114	"""
[14de349]	115	remainder = vector.size%boundary
[f5b9a6b]	116	if remainder != 0:
	117	size = vector.size + (boundary - remainder)
[14de349]	118	vector = np.hstack((vector, [extra]*(size-vector.size)))
	119	return np.ascontiguousarray(vector, dtype=dtype)
	120
	121
	122	def compile_model(context, source, dtype):
	123	"""
	124	Build a model to run on the gpu.
	125
	126	Returns the compiled program and its type. The returned type will
	127	be float32 even if the desired type is float64 if any of the
	128	devices in the context do not support the cl_khr_fp64 extension.
	129	"""
	130	dtype = np.dtype(dtype)
[cb6ecf4]	131	if dtype==generate.F64 and not all(has_double(d) for d in context.devices):
[ce27e21]	132	raise RuntimeError("Double precision not supported for devices")
[14de349]	133
[cb6ecf4]	134	header = F64_DEFS if dtype == generate.F64 else ""
	135	if dtype == generate.F32:
	136	source = generate.use_single(source)
[14de349]	137	# Note: USE_SINCOS makes the intel cpu slower under opencl
	138	if context.devices[0].type == cl.device_type.GPU:
	139	header += "#define USE_SINCOS\n"
	140	program = cl.Program(context, header+source).build()
[ce27e21]	141	return program
[14de349]	142
	143
	144	def make_result(self, size):
	145	self.res = np.empty(size, dtype=self.dtype)
	146	self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
	147	return self.res, self.res_b
	148
	149
	150	# for now, this returns one device in the context
	151	# TODO: create a context that contains all devices on all platforms
	152	class GpuEnvironment(object):
	153	"""
	154	GPU context, with possibly many devices, and one queue per device.
	155	"""
	156	def __init__(self):
[250fa25]	157	# find gpu context
	158	#self.context = cl.create_some_context()
	159
	160	self.context = None
	161	if 'PYOPENCL_CTX' in os.environ:
	162	self._create_some_context()
	163
	164	if not self.context:
	165	self.context = self._find_context()
	166
[f5b9a6b]	167	# Byte boundary for data alignment
	168	#self.data_boundary = max(d.min_data_type_align_size
	169	# for d in self.context.devices)
[250fa25]	170	self.queues = [cl.CommandQueue(self.context, d)
	171	for d in self.context.devices]
[ce27e21]	172	self.has_double = all(has_double(d) for d in self.context.devices)
	173	self.compiled = {}
	174
[250fa25]	175	def _create_some_context(self):
	176	try:
	177	self.context = cl.create_some_context(interactive=False)
	178	except Exception,exc:
	179	warnings.warn(str(exc))
	180	warnings.warn("pyopencl.create_some_context() failed")
	181	warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
	182
	183	def _find_context(self):
	184	default = None
	185	for platform in cl.get_platforms():
	186	for device in platform.get_devices():
	187	if device.type == cl.device_type.GPU:
	188	return cl.Context([device])
	189	if default is None:
	190	default = device
	191
	192	if not default:
	193	raise RuntimeError("OpenCL device not found")
	194
	195	return cl.Context([default])
	196
[ce27e21]	197	def compile_program(self, name, source, dtype):
	198	if name not in self.compiled:
	199	#print "compiling",name
	200	self.compiled[name] = compile_model(self.context, source, dtype)
	201	return self.compiled[name]
	202
	203	def release_program(self, name):
	204	if name in self.compiled:
	205	self.compiled[name].release()
	206	del self.compiled[name]
[14de349]	207
[250fa25]	208
class GpuModel(object):
    """
    GPU wrapper for a single model.

    *source* and *info* are the model source and interface as returned
    from :func:`generate.make`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
    """
    def __init__(self, source, info, dtype=generate.F32):
        self.info = info
        self.source = source
        self.dtype = dtype
        self.program = None # delay program creation

    def __getstate__(self):
        # The compiled program is not carried in the pickled state; it is
        # rebuilt on the first call after unpickling.
        state = self.__dict__.copy()
        state['program'] = None
        return state

    def __setstate__(self, state):
        self.__dict__ = state.copy()

    def __call__(self, input):
        """
        Return a kernel that evaluates the model at the q values in *input*.

        If the model supplies a python callable for the requested
        dimensionality a :class:`PyKernel` is returned; otherwise the OpenCL
        program is compiled on first use and a :class:`GpuKernel` is
        returned.

        Raises TypeError if *input* was created with a different dtype
        from the model.
        """
        # Support pure python kernel call
        if input.is_2D and callable(self.info['Iqxy']):
            return PyKernel(self.info['Iqxy'], self.info, input)
        elif not input.is_2D and callable(self.info['Iq']):
            return PyKernel(self.info['Iq'], self.info, input)

        if self.dtype != input.dtype:
            raise TypeError("data and kernel have different types")
        if self.program is None:
            self.program = environment().compile_program(
                self.info['name'], self.source, self.dtype)
        kernel_name = generate.kernel_name(self.info, input.is_2D)
        kernel = getattr(self.program, kernel_name)
        return GpuKernel(kernel, self.info, input)

    def release(self):
        """
        Release the compiled program, if any, from the shared environment.
        """
        if self.program is not None:
            environment().release_program(self.info['name'])
            self.program = None

    def make_input(self, q_vectors):
        """
        Make q input vectors available to the model.

        Note that each model needs its own q vector even in the case of
        mixture models because some models may be OpenCL, some may be
        ctypes and some may be pure python.
        """
        # Support pure python kernel call.  The choice must mirror the
        # dispatch in __call__: a python input is only appropriate when the
        # callable for *this* dimensionality is python, otherwise the OpenCL
        # kernel would be handed an input without device buffers.
        is_2d = (len(q_vectors) == 2)
        if len(q_vectors) == 1 and callable(self.info['Iq']):
            return PyInput(q_vectors, dtype=self.dtype)
        elif is_2d and callable(self.info['Iqxy']):
            return PyInput(q_vectors, dtype=self.dtype)
        else:
            return GpuInput(q_vectors, dtype=self.dtype)
[14de349]	270
	271	# TODO: check that we don't need a destructor for buffers which go out of scope
	272	class GpuInput(object):
	273	"""
	274	Make q data available to the gpu.
	275
	276	q_vectors is a list of q vectors, which will be [q] for 1-D data,
	277	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
	278	to get the best performance on OpenCL, which may involve shifting and
	279	stretching the array to better match the memory architecture. Additional
	280	points will be evaluated with q=1e-3.
	281
	282	dtype is the data type for the q vectors. The data type should be
	283	set to match that of the kernel, which is an attribute of
	284	:class:`GpuProgram`. Note that not all kernels support double
	285	precision, so even if the program was created for double precision,
	286	the GpuProgram.dtype may be single precision.
	287
	288	Call :meth:`release` when complete. Even if not called directly, the
	289	buffer will be released when the data object is freed.
	290	"""
[cb6ecf4]	291	def __init__(self, q_vectors, dtype=generate.F32):
[14de349]	292	env = environment()
	293	self.nq = q_vectors[0].size
	294	self.dtype = np.dtype(dtype)
	295	self.is_2D = (len(q_vectors) == 2)
[f5b9a6b]	296	# TODO: stretch input based on get_warp()
	297	# not doing it now since warp depends on kernel, which is not known
	298	# at this point, so instead using 32, which is good on the set of
	299	# architectures tested so far.
	300	self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
[14de349]	301	self.q_buffers = [
	302	cl.Buffer(env.context, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=q)
	303	for q in self.q_vectors
	304	]
	305	self.global_size = [self.q_vectors[0].size]
	306
	307	def release(self):
	308	for b in self.q_buffers:
	309	b.release()
	310	self.q_buffers = []
	311
	312	class GpuKernel(object):
[ff7119b]	313	"""
	314	Callable SAS kernel.
	315
	316	kernel is the GpuKernel object to call.
	317
	318	info is the module information
	319
	320	input is the DllInput q vectors at which the kernel should be
	321	evaluated.
	322
	323	The resulting call method takes the pars, a list of values for
	324	the fixed parameters to the kernel, and pd_pars, a list of (value,weight)
	325	vectors for the polydisperse parameters. cutoff determines the
	326	integration limits: any points with combined weight less than cutoff
	327	will not be calculated.
	328
	329	Call :meth:`release` when done with the kernel instance.
	330	"""
[ce27e21]	331	def __init__(self, kernel, info, input):
[14de349]	332	self.input = input
	333	self.kernel = kernel
[ce27e21]	334	self.info = info
	335	self.res = np.empty(input.nq, input.dtype)
	336	dim = '2d' if input.is_2D else '1d'
	337	self.fixed_pars = info['partype']['fixed-'+dim]
	338	self.pd_pars = info['partype']['pd-'+dim]
[14de349]	339
	340	# Inputs and outputs for each kernel call
[ce27e21]	341	# Note: res may be shorter than res_b if global_size != nq
	342	env = environment()
[14de349]	343	self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
[5d4777d]	344	2MAX_LOOPSinput.dtype.itemsize)
[14de349]	345	for _ in env.queues]
	346	self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
	347	input.global_size[0]*input.dtype.itemsize)
	348	for _ in env.queues]
	349
	350
[ce27e21]	351	def __call__(self, pars, pd_pars, cutoff=1e-5):
[cb6ecf4]	352	real = np.float32 if self.input.dtype == generate.F32 else np.float64
[ce27e21]	353	fixed = [real(p) for p in pars]
	354	cutoff = real(cutoff)
	355	loops = np.hstack(pd_pars)
	356	loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten()
[5d4777d]	357	Nloops = [np.uint32(len(p[0])) for p in pd_pars]
	358	#print "loops",Nloops, loops
[ce27e21]	359
[1780d59]	360	#import sys; print >>sys.stderr,"opencl eval",pars
[ce27e21]	361	#print "opencl eval",pars
[5d4777d]	362	if len(loops) > 2*MAX_LOOPS:
[ce27e21]	363	raise ValueError("too many polydispersity points")
[14de349]	364	device_num = 0
	365	res_bi = self.res_b[device_num]
	366	queuei = environment().queues[device_num]
	367	loops_bi = self.loops_b[device_num]
[ce27e21]	368	loops_l = cl.LocalMemory(len(loops.data))
[14de349]	369	cl.enqueue_copy(queuei, loops_bi, loops)
	370	#ctx = environment().context
	371	#loops_bi = cl.Buffer(ctx, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=loops)
[5d4777d]	372	args = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + Nloops
[ce27e21]	373	self.kernel(queuei, self.input.global_size, None, *args)
[14de349]	374	cl.enqueue_copy(queuei, self.res, res_bi)
	375
	376	return self.res
	377
	378	def release(self):
	379	for b in self.loops_b:
	380	b.release()
	381	self.loops_b = []
	382	for b in self.res_b:
	383	b.release()
	384	self.res_b = []
	385
	386	def __del__(self):
	387	self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernelcl.py @ 676351f

Download in other formats: