gpu.py @ 5d4777d

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since 5d4777d was 5d4777d, checked in by Paul Kienzle <pkienzle@…>, 10 years ago
reorganize, check and update models
Property mode set to `100644`
File size: 12.1 KB

Rev	Line
[14de349]	1	"""
	2	GPU support through OpenCL
	3
	4	There should be a single GPU environment running on the system. This
	5	environment is constructed on the first call to :func:`env`, and the
	6	same environment is returned on each call.
	7
	8	After retrieving the environment, the next step is to create the kernel.
	9	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
	10	returns the type of data used by the kernel.
	11
	12	Next a :class:`GpuData` object should be created with the correct kind
	13	of data. This data object can be used by multiple kernels, for example,
	14	if the target model is a weighted sum of multiple kernels. The data
	15	should include any extra evaluation points required to compute the proper
	16	data smearing. This need not match the square grid for 2D data if there
	17	is an index saying which q points are active.
	18
	19	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
	20	This kernel is used during fitting, receiving new sets of parameters and
	21	evaluating them. The output value is stored in an output buffer on the
	22	devices, where it can be combined with other structure factors and form
	23	factors and have instrumental resolution effects applied.
	24
	25
	26	"""
	27	import numpy as np
	28	import pyopencl as cl
	29	from pyopencl import mem_flags as mf
	30
	31	from . import gen
	32
	33	F32_DEFS = """\
	34	#define REAL(x) (x##f)
	35	#define real float
	36	"""
	37
	38	F64_DEFS = """\
	39	#pragma OPENCL EXTENSION cl_khr_fp64: enable
	40	#define REAL(x) (x)
	41	#define real double
	42	"""
	43
[ce27e21]	44	# The max loops number is limited by the amount of local memory available
	45	# on the device. You don't want to make this value too big because it will
	46	# waste resources, nor too small because it may interfere with users trying
	47	# to do their polydispersity calculations. A value of 1024 should be much
	48	# larger than necessary given that cost grows as npts^k where k is the number
	49	# of polydisperse parameters.
[5d4777d]	50	MAX_LOOPS = 2048
	51
	52	def load_model(kernel_module, dtype="single"):
	53	"""
	54	Load the OpenCL model defined by kernel_module.
	55
	56	Access to the OpenCL device is delayed until the kernel is called
	57	so models can be defined without using too many resources.
	58	"""
	59	source, info = gen.make(kernel_module)
	60	## for debugging, save source to a .cl file, edit it, and reload as model
	61	open(info['name']+'.cl','w').write(source)
	62	#source = open(info['name']+'.cl','r').read()
	63	return GpuModel(source, info, dtype)
[ce27e21]	64
[14de349]	65	ENV = None
	66	def environment():
	67	"""
	68	Returns a singleton :class:`GpuEnvironment`.
	69
	70	This provides an OpenCL context and one queue per device.
	71	"""
	72	global ENV
	73	if ENV is None:
	74	ENV = GpuEnvironment()
	75	return ENV
	76
	77	def has_double(device):
	78	"""
	79	Return true if device supports double precision.
	80	"""
	81	return "cl_khr_fp64" in device.extensions
	82
	83
	84	def _stretch_input(vector, dtype, extra=1e-3, boundary=128):
	85	"""
	86	Stretch an input vector to the correct boundary.
	87
	88	Performance on the kernels can drop by a factor of two or more if the
	89	number of values to compute does not fall on a nice power of two
	90	boundary. A good choice for the boundary value is the
	91	min_data_type_align_size property of the OpenCL device. The usual
	92	value of 128 gives a working size as a multiple of 32. The trailing
	93	additional vector elements are given a value of extra, and so
	94	f(extra) will be computed for each of them. The returned array
	95	will thus be a subset of the computed array.
	96	"""
	97	boundary // dtype.itemsize
	98	remainder = vector.size%boundary
	99	size = vector.size + (boundary - remainder if remainder != 0 else 0)
	100	if size != vector.size:
	101	vector = np.hstack((vector, [extra]*(size-vector.size)))
	102	return np.ascontiguousarray(vector, dtype=dtype)
	103
	104
	105	def compile_model(context, source, dtype):
	106	"""
	107	Build a model to run on the gpu.
	108
	109	Returns the compiled program and its type. The returned type will
	110	be float32 even if the desired type is float64 if any of the
	111	devices in the context do not support the cl_khr_fp64 extension.
	112	"""
	113	dtype = np.dtype(dtype)
[5d4777d]	114	if dtype==gen.F64 and not all(has_double(d) for d in context.devices):
[ce27e21]	115	raise RuntimeError("Double precision not supported for devices")
[14de349]	116
[5d4777d]	117	header = F64_DEFS if dtype == gen.F64 else F32_DEFS
[14de349]	118	# Note: USE_SINCOS makes the intel cpu slower under opencl
	119	if context.devices[0].type == cl.device_type.GPU:
	120	header += "#define USE_SINCOS\n"
	121	program = cl.Program(context, header+source).build()
[ce27e21]	122	return program
[14de349]	123
	124
	125	def make_result(self, size):
	126	self.res = np.empty(size, dtype=self.dtype)
	127	self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
	128	return self.res, self.res_b
	129
	130
	131	# for now, this returns one device in the context
	132	# TODO: create a context that contains all devices on all platforms
	133	class GpuEnvironment(object):
	134	"""
	135	GPU context, with possibly many devices, and one queue per device.
	136	"""
	137	def __init__(self):
	138	self.context = cl.create_some_context()
	139	self.queues = [cl.CommandQueue(self.context, d)
	140	for d in self.context.devices]
	141	self.boundary = max(d.min_data_type_align_size
	142	for d in self.context.devices)
[ce27e21]	143	self.has_double = all(has_double(d) for d in self.context.devices)
	144	self.compiled = {}
	145
	146	def compile_program(self, name, source, dtype):
	147	if name not in self.compiled:
	148	#print "compiling",name
	149	self.compiled[name] = compile_model(self.context, source, dtype)
	150	return self.compiled[name]
	151
	152	def release_program(self, name):
	153	if name in self.compiled:
	154	self.compiled[name].release()
	155	del self.compiled[name]
[14de349]	156
	157	class GpuModel(object):
	158	"""
	159	GPU wrapper for a single model.
	160
[ce27e21]	161	source and info are the model source and interface as returned
[14de349]	162	from :func:`gen.make`.
	163
	164	dtype is the desired model precision. Any numpy dtype for single
	165	or double precision floats will do, such as 'f', 'float32' or 'single'
	166	for single and 'd', 'float64' or 'double' for double. Double precision
	167	is an optional extension which may not be available on all devices.
	168	"""
[5d4777d]	169	def __init__(self, source, info, dtype=gen.F32):
[ce27e21]	170	self.info = info
	171	self.source = source
	172	self.dtype = dtype
	173	self.program = None # delay program creation
[14de349]	174
[ce27e21]	175	def __getstate__(self):
	176	state = self.__dict__.copy()
	177	state['program'] = None
	178	return state
[14de349]	179
[ce27e21]	180	def __setstate__(self, state):
	181	self.__dict__ = state.copy()
	182
	183	def __call__(self, input):
[14de349]	184	if self.dtype != input.dtype:
	185	raise TypeError("data and kernel have different types")
[ce27e21]	186	if self.program is None:
	187	self.program = environment().compile_program(self.info['name'],self.source, self.dtype)
	188	kernel_name = gen.kernel_name(self.info, input.is_2D)
[14de349]	189	kernel = getattr(self.program, kernel_name)
[ce27e21]	190	return GpuKernel(kernel, self.info, input)
	191
	192	def release(self):
	193	if self.program is not None:
	194	environment().release_program(self.info['name'])
	195	self.program = None
[14de349]	196
	197	def make_input(self, q_vectors):
	198	"""
	199	Make q input vectors available to the model.
	200
	201	This only needs to be done once for all models that operate on the
	202	same input. So for example, if you are adding two different models
	203	together to compare to a data set, then only one model needs to
	204	needs to call make_input, so long as the models have the same dtype.
	205	"""
	206	# Note: the weird interface, where make_input doesn't care which
	207	# model calls it, allows us to ask the model to define the data
	208	# and the caller does not need to know if it is opencl or ctypes.
	209	# The returned data object is opaque.
	210	return GpuInput(q_vectors, dtype=self.dtype)
	211
	212	# TODO: check that we don't need a destructor for buffers which go out of scope
	213	class GpuInput(object):
	214	"""
	215	Make q data available to the gpu.
	216
	217	q_vectors is a list of q vectors, which will be [q] for 1-D data,
	218	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
	219	to get the best performance on OpenCL, which may involve shifting and
	220	stretching the array to better match the memory architecture. Additional
	221	points will be evaluated with q=1e-3.
	222
	223	dtype is the data type for the q vectors. The data type should be
	224	set to match that of the kernel, which is an attribute of
	225	:class:`GpuProgram`. Note that not all kernels support double
	226	precision, so even if the program was created for double precision,
	227	the GpuProgram.dtype may be single precision.
	228
	229	Call :meth:`release` when complete. Even if not called directly, the
	230	buffer will be released when the data object is freed.
	231	"""
[5d4777d]	232	def __init__(self, q_vectors, dtype=gen.F32):
[14de349]	233	env = environment()
	234	self.nq = q_vectors[0].size
	235	self.dtype = np.dtype(dtype)
	236	self.is_2D = (len(q_vectors) == 2)
	237	self.q_vectors = [
	238	_stretch_input(q, self.dtype, boundary=env.boundary)
	239	for q in q_vectors
	240	]
	241	self.q_buffers = [
	242	cl.Buffer(env.context, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=q)
	243	for q in self.q_vectors
	244	]
	245	self.global_size = [self.q_vectors[0].size]
	246
	247	def release(self):
	248	for b in self.q_buffers:
	249	b.release()
	250	self.q_buffers = []
	251
	252	class GpuKernel(object):
[ff7119b]	253	"""
	254	Callable SAS kernel.
	255
	256	kernel is the GpuKernel object to call.
	257
	258	info is the module information
	259
	260	input is the DllInput q vectors at which the kernel should be
	261	evaluated.
	262
	263	The resulting call method takes the pars, a list of values for
	264	the fixed parameters to the kernel, and pd_pars, a list of (value,weight)
	265	vectors for the polydisperse parameters. cutoff determines the
	266	integration limits: any points with combined weight less than cutoff
	267	will not be calculated.
	268
	269	Call :meth:`release` when done with the kernel instance.
	270	"""
[ce27e21]	271	def __init__(self, kernel, info, input):
[14de349]	272	self.input = input
	273	self.kernel = kernel
[ce27e21]	274	self.info = info
	275	self.res = np.empty(input.nq, input.dtype)
	276	dim = '2d' if input.is_2D else '1d'
	277	self.fixed_pars = info['partype']['fixed-'+dim]
	278	self.pd_pars = info['partype']['pd-'+dim]
[14de349]	279
	280	# Inputs and outputs for each kernel call
[ce27e21]	281	# Note: res may be shorter than res_b if global_size != nq
	282	env = environment()
[14de349]	283	self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
[5d4777d]	284	2MAX_LOOPSinput.dtype.itemsize)
[14de349]	285	for _ in env.queues]
	286	self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
	287	input.global_size[0]*input.dtype.itemsize)
	288	for _ in env.queues]
	289
	290
[ce27e21]	291	def __call__(self, pars, pd_pars, cutoff=1e-5):
[5d4777d]	292	real = np.float32 if self.input.dtype == gen.F32 else np.float64
[ce27e21]	293	fixed = [real(p) for p in pars]
	294	cutoff = real(cutoff)
	295	loops = np.hstack(pd_pars)
	296	loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten()
[5d4777d]	297	Nloops = [np.uint32(len(p[0])) for p in pd_pars]
	298	#print "loops",Nloops, loops
[ce27e21]	299
[1780d59]	300	#import sys; print >>sys.stderr,"opencl eval",pars
[ce27e21]	301	#print "opencl eval",pars
[5d4777d]	302	if len(loops) > 2*MAX_LOOPS:
[ce27e21]	303	raise ValueError("too many polydispersity points")
[14de349]	304	device_num = 0
	305	res_bi = self.res_b[device_num]
	306	queuei = environment().queues[device_num]
	307	loops_bi = self.loops_b[device_num]
[ce27e21]	308	loops_l = cl.LocalMemory(len(loops.data))
[14de349]	309	cl.enqueue_copy(queuei, loops_bi, loops)
	310	#ctx = environment().context
	311	#loops_bi = cl.Buffer(ctx, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=loops)
[5d4777d]	312	args = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + Nloops
[ce27e21]	313	self.kernel(queuei, self.input.global_size, None, *args)
[14de349]	314	cl.enqueue_copy(queuei, self.res, res_bi)
	315
	316	return self.res
	317
	318	def release(self):
	319	for b in self.loops_b:
	320	b.release()
	321	self.loops_b = []
	322	for b in self.res_b:
	323	b.release()
	324	self.res_b = []
	325
	326	def __del__(self):
	327	self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/gpu.py @ 5d4777d

Download in other formats: