kernelcl.py @ aa4946b

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since aa4946b was aa4946b, checked in by Paul Kienzle <pkienzle@…>, 9 years ago
refactor so kernels are loaded via core.load_model
Property mode set to `100644`
File size: 13.9 KB

Line
1	"""
2	GPU support through OpenCL
3
4	There should be a single GPU environment running on the system. This
5	environment is constructed on the first call to :func:`env`, and the
6	same environment is returned on each call.
7
8	After retrieving the environment, the next step is to create the kernel.
9	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10	returns the type of data used by the kernel.
11
12	Next a :class:`GpuData` object should be created with the correct kind
13	of data. This data object can be used by multiple kernels, for example,
14	if the target model is a weighted sum of multiple kernels. The data
15	should include any extra evaluation points required to compute the proper
16	data smearing. This need not match the square grid for 2D data if there
17	is an index saying which q points are active.
18
19	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20	This kernel is used during fitting, receiving new sets of parameters and
21	evaluating them. The output value is stored in an output buffer on the
22	devices, where it can be combined with other structure factors and form
23	factors and have instrumental resolution effects applied.
24	"""
25	import os
26	import warnings
27
28	import numpy as np
29
30	try:
31	import pyopencl as cl
32	# Ask OpenCL for the default context so that we know that one exists
33	cl.create_some_context(interactive=False)
34	except Exception, exc:
35	warnings.warn(str(exc))
36	raise RuntimeError("OpenCL not available")
37
38	from pyopencl import mem_flags as mf
39
40	from . import generate
41	from .kernelpy import PyModel
42
43	F64_DEFS = """\
44	#ifdef cl_khr_fp64
45	# pragma OPENCL EXTENSION cl_khr_fp64: enable
46	#endif
47	"""
48
49	# The max loops number is limited by the amount of local memory available
50	# on the device. You don't want to make this value too big because it will
51	# waste resources, nor too small because it may interfere with users trying
52	# to do their polydispersity calculations. A value of 1024 should be much
53	# larger than necessary given that cost grows as npts^k where k is the number
54	# of polydisperse parameters.
55	MAX_LOOPS = 2048
56
57	def _load_model(kernel_module, dtype="single"):
58	"""
59	Load the OpenCL model defined by kernel_module.
60
61	Access to the OpenCL device is delayed until the kernel is called
62	so models can be defined without using too many resources.
63	"""
64	raise DeprecationWarning
65	source, info = generate.make(kernel_module)
66	if callable(info.get('Iq', None)):
67	return PyModel(info)
68	## for debugging, save source to a .cl file, edit it, and reload as model
69	#open(info['name']+'.cl','w').write(source)
70	#source = open(info['name']+'.cl','r').read()
71	return GpuModel(source, info, np.dtype(dtype))
72
73	ENV = None
74	def environment():
75	"""
76	Returns a singleton :class:`GpuEnvironment`.
77
78	This provides an OpenCL context and one queue per device.
79	"""
80	global ENV
81	if ENV is None:
82	ENV = GpuEnvironment()
83	return ENV
84
85	def has_double(device):
86	"""
87	Return true if device supports double precision.
88	"""
89	return "cl_khr_fp64" in device.extensions
90
91	def get_warp(kernel, queue):
92	"""
93	Return the size of an execution batch for kernel running on queue.
94	"""
95	return kernel.get_work_group_info(
96	cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
97	queue.device)
98
99	def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
100	"""
101	Stretch an input vector to the correct boundary.
102
103	Performance on the kernels can drop by a factor of two or more if the
104	number of values to compute does not fall on a nice power of two
105	boundary. The trailing additional vector elements are given a
106	value of extra, and so f(extra) will be computed for each of
107	them. The returned array will thus be a subset of the computed array.
108
109	boundary should be a power of 2 which is at least 32 for good
110	performance on current platforms (as of Jan 2015). It should
111	probably be the max of get_warp(kernel,queue) and
112	device.min_data_type_align_size//4.
113	"""
114	remainder = vector.size % boundary
115	if remainder != 0:
116	size = vector.size + (boundary - remainder)
117	vector = np.hstack((vector, [extra] * (size - vector.size)))
118	return np.ascontiguousarray(vector, dtype=dtype)
119
120
121	def compile_model(context, source, dtype):
122	"""
123	Build a model to run on the gpu.
124
125	Returns the compiled program and its type. The returned type will
126	be float32 even if the desired type is float64 if any of the
127	devices in the context do not support the cl_khr_fp64 extension.
128	"""
129	dtype = np.dtype(dtype)
130	if dtype == generate.F64 and not all(has_double(d) for d in context.devices):
131	raise RuntimeError("Double precision not supported for devices")
132
133	header = F64_DEFS if dtype == generate.F64 else ""
134	if dtype == generate.F32:
135	source = generate.use_single(source)
136	# Note: USE_SINCOS makes the intel cpu slower under opencl
137	if context.devices[0].type == cl.device_type.GPU:
138	header += "#define USE_SINCOS\n"
139	program = cl.Program(context, header + source).build()
140	return program
141
142
143	def make_result(self, size):
144	self.res = np.empty(size, dtype=self.dtype)
145	self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
146	return self.res, self.res_b
147
148
149	# for now, this returns one device in the context
150	# TODO: create a context that contains all devices on all platforms
151	class GpuEnvironment(object):
152	"""
153	GPU context, with possibly many devices, and one queue per device.
154	"""
155	def __init__(self):
156	# find gpu context
157	#self.context = cl.create_some_context()
158
159	self.context = None
160	if 'PYOPENCL_CTX' in os.environ:
161	self._create_some_context()
162
163	if not self.context:
164	self.context = _get_default_context()
165
166	# Byte boundary for data alignment
167	#self.data_boundary = max(d.min_data_type_align_size
168	# for d in self.context.devices)
169	self.queues = [cl.CommandQueue(self.context, d)
170	for d in self.context.devices]
171	self.has_double = all(has_double(d) for d in self.context.devices)
172	self.compiled = {}
173
174	def _create_some_context(self):
175	try:
176	self.context = cl.create_some_context(interactive=False)
177	except Exception, exc:
178	warnings.warn(str(exc))
179	warnings.warn("pyopencl.create_some_context() failed")
180	warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
181
182	def compile_program(self, name, source, dtype):
183	if name not in self.compiled:
184	#print "compiling",name
185	self.compiled[name] = compile_model(self.context, source, dtype)
186	return self.compiled[name]
187
188	def release_program(self, name):
189	if name in self.compiled:
190	self.compiled[name].release()
191	del self.compiled[name]
192
193	def _get_default_context():
194	default = None
195	for platform in cl.get_platforms():
196	for device in platform.get_devices():
197	if device.type == cl.device_type.GPU:
198	return cl.Context([device])
199	if default is None:
200	default = device
201
202	if not default:
203	raise RuntimeError("OpenCL device not found")
204
205	return cl.Context([default])
206
207
208	class GpuModel(object):
209	"""
210	GPU wrapper for a single model.
211
212	source and info are the model source and interface as returned
213	from :func:`gen.make`.
214
215	dtype is the desired model precision. Any numpy dtype for single
216	or double precision floats will do, such as 'f', 'float32' or 'single'
217	for single and 'd', 'float64' or 'double' for double. Double precision
218	is an optional extension which may not be available on all devices.
219	"""
220	def __init__(self, source, info, dtype=generate.F32):
221	self.info = info
222	self.source = source
223	self.dtype = np.dtype(dtype)
224	self.program = None # delay program creation
225
226	def __getstate__(self):
227	state = self.__dict__.copy()
228	state['program'] = None
229	return state
230
231	def __setstate__(self, state):
232	self.__dict__ = state.copy()
233
234	def __call__(self, q_input):
235	if self.dtype != q_input.dtype:
236	raise TypeError("data is %s kernel is %s" % (q_input.dtype, self.dtype))
237	if self.program is None:
238	compiler = environment().compile_program
239	self.program = compiler(self.info['name'], self.source, self.dtype)
240	kernel_name = generate.kernel_name(self.info, q_input.is_2D)
241	kernel = getattr(self.program, kernel_name)
242	return GpuKernel(kernel, self.info, q_input)
243
244	def release(self):
245	if self.program is not None:
246	environment().release_program(self.info['name'])
247	self.program = None
248
249	def make_input(self, q_vectors):
250	"""
251	Make q input vectors available to the model.
252
253	Note that each model needs its own q vector even if the case of
254	mixture models because some models may be OpenCL, some may be
255	ctypes and some may be pure python.
256	"""
257	return GpuInput(q_vectors, dtype=self.dtype)
258
259	# TODO: check that we don't need a destructor for buffers which go out of scope
260	class GpuInput(object):
261	"""
262	Make q data available to the gpu.
263
264	q_vectors is a list of q vectors, which will be [q] for 1-D data,
265	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
266	to get the best performance on OpenCL, which may involve shifting and
267	stretching the array to better match the memory architecture. Additional
268	points will be evaluated with q=1e-3.
269
270	dtype is the data type for the q vectors. The data type should be
271	set to match that of the kernel, which is an attribute of
272	:class:`GpuProgram`. Note that not all kernels support double
273	precision, so even if the program was created for double precision,
274	the GpuProgram.dtype may be single precision.
275
276	Call :meth:`release` when complete. Even if not called directly, the
277	buffer will be released when the data object is freed.
278	"""
279	def __init__(self, q_vectors, dtype=generate.F32):
280	env = environment()
281	self.nq = q_vectors[0].size
282	self.dtype = np.dtype(dtype)
283	self.is_2D = (len(q_vectors) == 2)
284	# TODO: stretch input based on get_warp()
285	# not doing it now since warp depends on kernel, which is not known
286	# at this point, so instead using 32, which is good on the set of
287	# architectures tested so far.
288	self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
289	self.q_buffers = [
290	cl.Buffer(env.context, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=q)
291	for q in self.q_vectors
292	]
293	self.global_size = [self.q_vectors[0].size]
294
295	def release(self):
296	for b in self.q_buffers:
297	b.release()
298	self.q_buffers = []
299
300	class GpuKernel(object):
301	"""
302	Callable SAS kernel.
303
304	kernel is the GpuKernel object to call.
305
306	info is the module information
307
308	q_input is the DllInput q vectors at which the kernel should be
309	evaluated.
310
311	The resulting call method takes the pars, a list of values for
312	the fixed parameters to the kernel, and pd_pars, a list of (value,weight)
313	vectors for the polydisperse parameters. cutoff determines the
314	integration limits: any points with combined weight less than cutoff
315	will not be calculated.
316
317	Call :meth:`release` when done with the kernel instance.
318	"""
319	def __init__(self, kernel, info, q_input):
320	self.q_input = q_input
321	self.kernel = kernel
322	self.info = info
323	self.res = np.empty(q_input.nq, q_input.dtype)
324	dim = '2d' if q_input.is_2D else '1d'
325	self.fixed_pars = info['partype']['fixed-' + dim]
326	self.pd_pars = info['partype']['pd-' + dim]
327
328	# Inputs and outputs for each kernel call
329	# Note: res may be shorter than res_b if global_size != nq
330	env = environment()
331	self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
332	2 * MAX_LOOPS * q_input.dtype.itemsize)
333	for _ in env.queues]
334	self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
335	q_input.global_size[0] * q_input.dtype.itemsize)
336	for _ in env.queues]
337
338
339	def __call__(self, fixed_pars, pd_pars, cutoff=1e-5):
340	real = np.float32 if self.q_input.dtype == generate.F32 else np.float64
341
342	device_num = 0
343	queuei = environment().queues[device_num]
344	res_bi = self.res_b[device_num]
345	nq = np.uint32(self.q_input.nq)
346	if pd_pars:
347	cutoff = real(cutoff)
348	loops_N = [np.uint32(len(p[0])) for p in pd_pars]
349	loops = np.hstack(pd_pars) \
350	if pd_pars else np.empty(0, dtype=self.q_input.dtype)
351	loops = np.ascontiguousarray(loops.T, self.q_input.dtype).flatten()
352	#print "loops",Nloops, loops
353
354	#import sys; print >>sys.stderr,"opencl eval",pars
355	#print "opencl eval",pars
356	if len(loops) > 2 * MAX_LOOPS:
357	raise ValueError("too many polydispersity points")
358
359	loops_bi = self.loops_b[device_num]
360	cl.enqueue_copy(queuei, loops_bi, loops)
361	loops_l = cl.LocalMemory(len(loops.data))
362	#ctx = environment().context
363	#loops_bi = cl.Buffer(ctx, mf.READ_ONLY\|mf.COPY_HOST_PTR, hostbuf=loops)
364	dispersed = [loops_bi, loops_l, cutoff] + loops_N
365	else:
366	dispersed = []
367	fixed = [real(p) for p in fixed_pars]
368	args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed
369	self.kernel(queuei, self.q_input.global_size, None, *args)
370	cl.enqueue_copy(queuei, self.res, res_bi)
371
372	return self.res
373
374	def release(self):
375	for b in self.loops_b:
376	b.release()
377	self.loops_b = []
378	for b in self.res_b:
379	b.release()
380	self.res_b = []
381
382	def __del__(self):
383	self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernelcl.py @ aa4946b

Download in other formats: