kernelcl.py @ 750ffa5

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since 750ffa5 was 750ffa5, checked in by Paul Kienzle <pkienzle@…>, 9 years ago
allow test of dll using single precision
Property mode set to `100644`
File size: 13.9 KB

Line
1	"""
2	GPU support through OpenCL
3
4	There should be a single GPU environment running on the system. This
5	environment is constructed on the first call to :func:`env`, and the
6	same environment is returned on each call.
7
8	After retrieving the environment, the next step is to create the kernel.
9	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10	returns the type of data used by the kernel.
11
12	Next a :class:`GpuData` object should be created with the correct kind
13	of data. This data object can be used by multiple kernels, for example,
14	if the target model is a weighted sum of multiple kernels. The data
15	should include any extra evaluation points required to compute the proper
16	data smearing. This need not match the square grid for 2D data if there
17	is an index saying which q points are active.
18
19	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20	This kernel is used during fitting, receiving new sets of parameters and
21	evaluating them. The output value is stored in an output buffer on the
22	devices, where it can be combined with other structure factors and form
23	factors and have instrumental resolution effects applied.
24	"""
25	import os
26	import warnings
27
28	import numpy as np
29
30	try:
31	import pyopencl as cl
32	# Ask OpenCL for the default context so that we know that one exists
33	cl.create_some_context(interactive=False)
34	except Exception, exc:
35	warnings.warn(str(exc))
36	raise RuntimeError("OpenCL not available")
37
38	from pyopencl import mem_flags as mf
39
40	from . import generate
41	from .kernelpy import PyModel
42
43	F64_DEFS = """\
44	#ifdef cl_khr_fp64
45	# pragma OPENCL EXTENSION cl_khr_fp64: enable
46	#endif
47	"""
48
49	# The max loops number is limited by the amount of local memory available
50	# on the device. You don't want to make this value too big because it will
51	# waste resources, nor too small because it may interfere with users trying
52	# to do their polydispersity calculations. A value of 1024 should be much
53	# larger than necessary given that cost grows as npts^k where k is the number
54	# of polydisperse parameters.
55	MAX_LOOPS = 2048
56
57	def load_model(kernel_module, dtype="single"):
58	"""
59	Load the OpenCL model defined by kernel_module.
60
61	Access to the OpenCL device is delayed until the kernel is called
62	so models can be defined without using too many resources.
63	"""
64	source, info = generate.make(kernel_module)
65	if callable(info.get('Iq', None)):
66	return PyModel(info)
67	## for debugging, save source to a .cl file, edit it, and reload as model
68	#open(info['name']+'.cl','w').write(source)
69	#source = open(info['name']+'.cl','r').read()
70	return GpuModel(source, info, np.dtype(dtype))
71
72	ENV = None
73	def environment():
74	"""
75	Returns a singleton :class:`GpuEnvironment`.
76
77	This provides an OpenCL context and one queue per device.
78	"""
79	global ENV
80	if ENV is None:
81	ENV = GpuEnvironment()
82	return ENV
83
84	def has_double(device):
85	"""
86	Return true if device supports double precision.
87	"""
88	return "cl_khr_fp64" in device.extensions
89
90	def get_warp(kernel, queue):
91	"""
92	Return the size of an execution batch for kernel running on queue.
93	"""
94	return kernel.get_work_group_info(
95	cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
96	queue.device)
97
98	def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
99	"""
100	Stretch an input vector to the correct boundary.
101
102	Performance on the kernels can drop by a factor of two or more if the
103	number of values to compute does not fall on a nice power of two
104	boundary. The trailing additional vector elements are given a
105	value of extra, and so f(extra) will be computed for each of
106	them. The returned array will thus be a subset of the computed array.
107
108	boundary should be a power of 2 which is at least 32 for good
109	performance on current platforms (as of Jan 2015). It should
110	probably be the max of get_warp(kernel,queue) and
111	device.min_data_type_align_size//4.
112	"""
113	remainder = vector.size % boundary
114	if remainder != 0:
115	size = vector.size + (boundary - remainder)
116	vector = np.hstack((vector, [extra] * (size - vector.size)))
117	return np.ascontiguousarray(vector, dtype=dtype)
118
119
120	def compile_model(context, source, dtype):
121	"""
122	Build a model to run on the gpu.
123
124	Returns the compiled program and its type. The returned type will
125	be float32 even if the desired type is float64 if any of the
126	devices in the context do not support the cl_khr_fp64 extension.
127	"""
128	dtype = np.dtype(dtype)
129	if dtype == generate.F64 and not all(has_double(d) for d in context.devices):
130	raise RuntimeError("Double precision not supported for devices")
131
132	header = F64_DEFS if dtype == generate.F64 else ""
133	if dtype == generate.F32:
134	source = generate.use_single(source)
135	# Note: USE_SINCOS makes the intel cpu slower under opencl
136	if context.devices[0].type == cl.device_type.GPU:
137	header += "#define USE_SINCOS\n"
138	program = cl.Program(context, header + source).build()
139	return program
140
141
142	def make_result(self, size):
143	self.res = np.empty(size, dtype=self.dtype)
144	self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
145	return self.res, self.res_b
146
147
148	# for now, this returns one device in the context
149	# TODO: create a context that contains all devices on all platforms
150	class GpuEnvironment(object):
151	"""
152	GPU context, with possibly many devices, and one queue per device.
153	"""
154	def __init__(self):
155	# find gpu context
156	#self.context = cl.create_some_context()
157
158	self.context = None
159	if 'PYOPENCL_CTX' in os.environ:
160	self._create_some_context()
161
162	if not self.context:
163	self.context = _get_default_context()
164
165	# Byte boundary for data alignment
166	#self.data_boundary = max(d.min_data_type_align_size
167	# for d in self.context.devices)
168	self.queues = [cl.CommandQueue(self.context, d)
169	for d in self.context.devices]
170	self.has_double = all(has_double(d) for d in self.context.devices)
171	self.compiled = {}
172
173	def _create_some_context(self):
174	try:
175	self.context = cl.create_some_context(interactive=False)
176	except Exception, exc:
177	warnings.warn(str(exc))
178	warnings.warn("pyopencl.create_some_context() failed")
179	warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
180
181	def compile_program(self, name, source, dtype):
182	if name not in self.compiled:
183	#print "compiling",name
184	self.compiled[name] = compile_model(self.context, source, dtype)
185	return self.compiled[name]
186
187	def release_program(self, name):
188	if name in self.compiled:
189	self.compiled[name].release()
190	del self.compiled[name]
191
192	def _get_default_context():
193	default = None
194	for platform in cl.get_platforms():
195	for device in platform.get_devices():
196	if device.type == cl.device_type.GPU:
197	return cl.Context([device])
198	if default is None:
199	default = device
200
201	if not default:
202	raise RuntimeError("OpenCL device not found")
203
204	return cl.Context([default])
205
206
207	class GpuModel(object):
208	"""
209	GPU wrapper for a single model.
210
211	source and info are the model source and interface as returned
212	from :func:`gen.make`.
213
214	dtype is the desired model precision. Any numpy dtype for single
215	or double precision floats will do, such as 'f', 'float32' or 'single'
216	for single and 'd', 'float64' or 'double' for double. Double precision
217	is an optional extension which may not be available on all devices.
218	"""
219	def __init__(self, source, info, dtype=generate.F32):
220	self.info = info
221	self.source = source
222	self.dtype = np.dtype(dtype)
223	self.program = None # delay program creation
224
225	def __getstate__(self):
226	state = self.__dict__.copy()
227	state['program'] = None
228	return state
229
230	def __setstate__(self, state):
231	self.__dict__ = state.copy()
232
233	def __call__(self, q_input):
234	if self.dtype != q_input.dtype:
235	raise TypeError("data is %s kernel is %s" % (q_input.dtype, self.dtype))
236	if self.program is None:
237	compiler = environment().compile_program
238	self.program = compiler(self.info['name'], self.source, self.dtype)
239	kernel_name = generate.kernel_name(self.info, q_input.is_2D)
240	kernel = getattr(self.program, kernel_name)
241	return GpuKernel(kernel, self.info, q_input)
242
243	def release(self):
244	if self.program is not None:
245	environment().release_program(self.info['name'])
246	self.program = None
247
248	def make_input(self, q_vectors):
249	"""
250	Make q input vectors available to the model.
251
252	Note that each model needs its own q vector even if the case of
253	mixture models because some models may be OpenCL, some may be
254	ctypes and some may be pure python.
255	"""
256	return GpuInput(q_vectors, dtype=self.dtype)
257
258	# TODO: check that we don't need a destructor for buffers which go out of scope
259	class GpuInput(object):
260	"""
261	Make q data available to the gpu.
262
263	q_vectors is a list of q vectors, which will be [q] for 1-D data,
264	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
265	to get the best performance on OpenCL, which may involve shifting and
266	stretching the array to better match the memory architecture. Additional
267	points will be evaluated with q=1e-3.
268
269	dtype is the data type for the q vectors. The data type should be
270	set to match that of the kernel, which is an attribute of
271	:class:`GpuProgram`. Note that not all kernels support double
272	precision, so even if the program was created for double precision,
273	the GpuProgram.dtype may be single precision.
274
275	Call :meth:`release` when complete. Even if not called directly, the
276	buffer will be released when the data object is freed.
277	"""
278	def __init__(self, q_vectors, dtype=generate.F32):
279	env = environment()
280	self.nq = q_vectors[0].size
281	self.dtype = np.dtype(dtype)
282	self.is_2D = (len(q_vectors) == 2)
283	# TODO: stretch input based on get_warp()
284	# not doing it now since warp depends on kernel, which is not known
285	# at this point, so instead using 32, which is good on the set of
286	# architectures tested so far.
287	self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
288	self.q_buffers = [
289	cl.Buffer(env.context, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=q)
290	for q in self.q_vectors
291	]
292	self.global_size = [self.q_vectors[0].size]
293
294	def release(self):
295	for b in self.q_buffers:
296	b.release()
297	self.q_buffers = []
298
299	class GpuKernel(object):
300	"""
301	Callable SAS kernel.
302
303	kernel is the GpuKernel object to call.
304
305	info is the module information
306
307	q_input is the DllInput q vectors at which the kernel should be
308	evaluated.
309
310	The resulting call method takes the pars, a list of values for
311	the fixed parameters to the kernel, and pd_pars, a list of (value,weight)
312	vectors for the polydisperse parameters. cutoff determines the
313	integration limits: any points with combined weight less than cutoff
314	will not be calculated.
315
316	Call :meth:`release` when done with the kernel instance.
317	"""
318	def __init__(self, kernel, info, q_input):
319	self.q_input = q_input
320	self.kernel = kernel
321	self.info = info
322	self.res = np.empty(q_input.nq, q_input.dtype)
323	dim = '2d' if q_input.is_2D else '1d'
324	self.fixed_pars = info['partype']['fixed-' + dim]
325	self.pd_pars = info['partype']['pd-' + dim]
326
327	# Inputs and outputs for each kernel call
328	# Note: res may be shorter than res_b if global_size != nq
329	env = environment()
330	self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
331	2 * MAX_LOOPS * q_input.dtype.itemsize)
332	for _ in env.queues]
333	self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
334	q_input.global_size[0] * q_input.dtype.itemsize)
335	for _ in env.queues]
336
337
338	def __call__(self, fixed_pars, pd_pars, cutoff=1e-5):
339	real = np.float32 if self.q_input.dtype == generate.F32 else np.float64
340
341	device_num = 0
342	queuei = environment().queues[device_num]
343	res_bi = self.res_b[device_num]
344	nq = np.uint32(self.q_input.nq)
345	if pd_pars:
346	cutoff = real(cutoff)
347	loops_N = [np.uint32(len(p[0])) for p in pd_pars]
348	loops = np.hstack(pd_pars) \
349	if pd_pars else np.empty(0, dtype=self.q_input.dtype)
350	loops = np.ascontiguousarray(loops.T, self.q_input.dtype).flatten()
351	#print "loops",Nloops, loops
352
353	#import sys; print >>sys.stderr,"opencl eval",pars
354	#print "opencl eval",pars
355	if len(loops) > 2 * MAX_LOOPS:
356	raise ValueError("too many polydispersity points")
357
358	loops_bi = self.loops_b[device_num]
359	cl.enqueue_copy(queuei, loops_bi, loops)
360	loops_l = cl.LocalMemory(len(loops.data))
361	#ctx = environment().context
362	#loops_bi = cl.Buffer(ctx, mf.READ_ONLY\|mf.COPY_HOST_PTR, hostbuf=loops)
363	dispersed = [loops_bi, loops_l, cutoff] + loops_N
364	else:
365	dispersed = []
366	fixed = [real(p) for p in fixed_pars]
367	args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed
368	self.kernel(queuei, self.q_input.global_size, None, *args)
369	cl.enqueue_copy(queuei, self.res, res_bi)
370
371	return self.res
372
373	def release(self):
374	for b in self.loops_b:
375	b.release()
376	self.loops_b = []
377	for b in self.res_b:
378	b.release()
379	self.res_b = []
380
381	def __del__(self):
382	self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernelcl.py @ 750ffa5

Download in other formats: