kernelcl.py @ 676351f

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since 676351f was 676351f, checked in by ajj, 9 years ago
Merge branch 'master' of https://github.com/SasView/sasmodels
Property mode set to `100644`
File size: 14.1 KB

Line
1	"""
2	GPU support through OpenCL
3
4	There should be a single GPU environment running on the system. This
5	environment is constructed on the first call to :func:`env`, and the
6	same environment is returned on each call.
7
8	After retrieving the environment, the next step is to create the kernel.
9	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10	returns the type of data used by the kernel.
11
12	Next a :class:`GpuData` object should be created with the correct kind
13	of data. This data object can be used by multiple kernels, for example,
14	if the target model is a weighted sum of multiple kernels. The data
15	should include any extra evaluation points required to compute the proper
16	data smearing. This need not match the square grid for 2D data if there
17	is an index saying which q points are active.
18
19	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20	This kernel is used during fitting, receiving new sets of parameters and
21	evaluating them. The output value is stored in an output buffer on the
22	devices, where it can be combined with other structure factors and form
23	factors and have instrumental resolution effects applied.
24	"""
25	import os
26	import warnings
27
28	import numpy as np
29
30	try:
31	import pyopencl as cl
32	except ImportError,exc:
33	warnings.warn(str(exc))
34	raise RuntimeError("OpenCL not available")
35
36	try:
37	context = cl.create_some_context(interactive=False)
38	del context
39	except cl.RuntimeError, exc:
40	warnings.warn(str(exc))
41	raise RuntimeError("OpenCl not available")
42
43	from pyopencl import mem_flags as mf
44
45	from . import generate
46	from .kernelpy import PyInput, PyKernel
47
48	F64_DEFS = """\
49	#ifdef cl_khr_fp64
50	# pragma OPENCL EXTENSION cl_khr_fp64: enable
51	#endif
52	"""
53
54	# The max loops number is limited by the amount of local memory available
55	# on the device. You don't want to make this value too big because it will
56	# waste resources, nor too small because it may interfere with users trying
57	# to do their polydispersity calculations. A value of 1024 should be much
58	# larger than necessary given that cost grows as npts^k where k is the number
59	# of polydisperse parameters.
60	MAX_LOOPS = 2048
61
62	def load_model(kernel_module, dtype="single"):
63	"""
64	Load the OpenCL model defined by kernel_module.
65
66	Access to the OpenCL device is delayed until the kernel is called
67	so models can be defined without using too many resources.
68	"""
69	source, info = generate.make(kernel_module)
70	## for debugging, save source to a .cl file, edit it, and reload as model
71	#open(info['name']+'.cl','w').write(source)
72	#source = open(info['name']+'.cl','r').read()
73	return GpuModel(source, info, dtype)
74
75	ENV = None
76	def environment():
77	"""
78	Returns a singleton :class:`GpuEnvironment`.
79
80	This provides an OpenCL context and one queue per device.
81	"""
82	global ENV
83	if ENV is None:
84	ENV = GpuEnvironment()
85	return ENV
86
87	def has_double(device):
88	"""
89	Return true if device supports double precision.
90	"""
91	return "cl_khr_fp64" in device.extensions
92
93	def get_warp(kernel, queue):
94	"""
95	Return the size of an execution batch for kernel running on queue.
96	"""
97	return kernel.get_work_group_info(cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
98	queue.device)
99
100	def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
101	"""
102	Stretch an input vector to the correct boundary.
103
104	Performance on the kernels can drop by a factor of two or more if the
105	number of values to compute does not fall on a nice power of two
106	boundary. The trailing additional vector elements are given a
107	value of extra, and so f(extra) will be computed for each of
108	them. The returned array will thus be a subset of the computed array.
109
110	boundary should be a power of 2 which is at least 32 for good
111	performance on current platforms (as of Jan 2015). It should
112	probably be the max of get_warp(kernel,queue) and
113	device.min_data_type_align_size//4.
114	"""
115	remainder = vector.size%boundary
116	if remainder != 0:
117	size = vector.size + (boundary - remainder)
118	vector = np.hstack((vector, [extra]*(size-vector.size)))
119	return np.ascontiguousarray(vector, dtype=dtype)
120
121
122	def compile_model(context, source, dtype):
123	"""
124	Build a model to run on the gpu.
125
126	Returns the compiled program and its type. The returned type will
127	be float32 even if the desired type is float64 if any of the
128	devices in the context do not support the cl_khr_fp64 extension.
129	"""
130	dtype = np.dtype(dtype)
131	if dtype==generate.F64 and not all(has_double(d) for d in context.devices):
132	raise RuntimeError("Double precision not supported for devices")
133
134	header = F64_DEFS if dtype == generate.F64 else ""
135	if dtype == generate.F32:
136	source = generate.use_single(source)
137	# Note: USE_SINCOS makes the intel cpu slower under opencl
138	if context.devices[0].type == cl.device_type.GPU:
139	header += "#define USE_SINCOS\n"
140	program = cl.Program(context, header+source).build()
141	return program
142
143
144	def make_result(self, size):
145	self.res = np.empty(size, dtype=self.dtype)
146	self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
147	return self.res, self.res_b
148
149
150	# for now, this returns one device in the context
151	# TODO: create a context that contains all devices on all platforms
152	class GpuEnvironment(object):
153	"""
154	GPU context, with possibly many devices, and one queue per device.
155	"""
156	def __init__(self):
157	# find gpu context
158	#self.context = cl.create_some_context()
159
160	self.context = None
161	if 'PYOPENCL_CTX' in os.environ:
162	self._create_some_context()
163
164	if not self.context:
165	self.context = self._find_context()
166
167	# Byte boundary for data alignment
168	#self.data_boundary = max(d.min_data_type_align_size
169	# for d in self.context.devices)
170	self.queues = [cl.CommandQueue(self.context, d)
171	for d in self.context.devices]
172	self.has_double = all(has_double(d) for d in self.context.devices)
173	self.compiled = {}
174
175	def _create_some_context(self):
176	try:
177	self.context = cl.create_some_context(interactive=False)
178	except Exception,exc:
179	warnings.warn(str(exc))
180	warnings.warn("pyopencl.create_some_context() failed")
181	warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
182
183	def _find_context(self):
184	default = None
185	for platform in cl.get_platforms():
186	for device in platform.get_devices():
187	if device.type == cl.device_type.GPU:
188	return cl.Context([device])
189	if default is None:
190	default = device
191
192	if not default:
193	raise RuntimeError("OpenCL device not found")
194
195	return cl.Context([default])
196
197	def compile_program(self, name, source, dtype):
198	if name not in self.compiled:
199	#print "compiling",name
200	self.compiled[name] = compile_model(self.context, source, dtype)
201	return self.compiled[name]
202
203	def release_program(self, name):
204	if name in self.compiled:
205	self.compiled[name].release()
206	del self.compiled[name]
207
208
209	class GpuModel(object):
210	"""
211	GPU wrapper for a single model.
212
213	source and info are the model source and interface as returned
214	from :func:`gen.make`.
215
216	dtype is the desired model precision. Any numpy dtype for single
217	or double precision floats will do, such as 'f', 'float32' or 'single'
218	for single and 'd', 'float64' or 'double' for double. Double precision
219	is an optional extension which may not be available on all devices.
220	"""
221	def __init__(self, source, info, dtype=generate.F32):
222	self.info = info
223	self.source = source
224	self.dtype = dtype
225	self.program = None # delay program creation
226
227	def __getstate__(self):
228	state = self.__dict__.copy()
229	state['program'] = None
230	return state
231
232	def __setstate__(self, state):
233	self.__dict__ = state.copy()
234
235	def __call__(self, input):
236	# Support pure python kernel call
237	if input.is_2D and callable(self.info['Iqxy']):
238	return PyKernel(self.info['Iqxy'], self.info, input)
239	elif not input.is_2D and callable(self.info['Iq']):
240	return PyKernel(self.info['Iq'], self.info, input)
241
242	if self.dtype != input.dtype:
243	raise TypeError("data and kernel have different types")
244	if self.program is None:
245	self.program = environment().compile_program(self.info['name'],self.source, self.dtype)
246	kernel_name = generate.kernel_name(self.info, input.is_2D)
247	kernel = getattr(self.program, kernel_name)
248	return GpuKernel(kernel, self.info, input)
249
250	def release(self):
251	if self.program is not None:
252	environment().release_program(self.info['name'])
253	self.program = None
254
255	def make_input(self, q_vectors):
256	"""
257	Make q input vectors available to the model.
258
259	Note that each model needs its own q vector even if the case of
260	mixture models because some models may be OpenCL, some may be
261	ctypes and some may be pure python.
262	"""
263	# Support pure python kernel call
264	if len(q_vectors) == 1 and callable(self.info['Iq']):
265	return PyInput(q_vectors, dtype=self.dtype)
266	elif callable(self.info['Iqxy']):
267	return PyInput(q_vectors, dtype=self.dtype)
268	else:
269	return GpuInput(q_vectors, dtype=self.dtype)
270
271	# TODO: check that we don't need a destructor for buffers which go out of scope
272	class GpuInput(object):
273	"""
274	Make q data available to the gpu.
275
276	q_vectors is a list of q vectors, which will be [q] for 1-D data,
277	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
278	to get the best performance on OpenCL, which may involve shifting and
279	stretching the array to better match the memory architecture. Additional
280	points will be evaluated with q=1e-3.
281
282	dtype is the data type for the q vectors. The data type should be
283	set to match that of the kernel, which is an attribute of
284	:class:`GpuProgram`. Note that not all kernels support double
285	precision, so even if the program was created for double precision,
286	the GpuProgram.dtype may be single precision.
287
288	Call :meth:`release` when complete. Even if not called directly, the
289	buffer will be released when the data object is freed.
290	"""
291	def __init__(self, q_vectors, dtype=generate.F32):
292	env = environment()
293	self.nq = q_vectors[0].size
294	self.dtype = np.dtype(dtype)
295	self.is_2D = (len(q_vectors) == 2)
296	# TODO: stretch input based on get_warp()
297	# not doing it now since warp depends on kernel, which is not known
298	# at this point, so instead using 32, which is good on the set of
299	# architectures tested so far.
300	self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
301	self.q_buffers = [
302	cl.Buffer(env.context, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=q)
303	for q in self.q_vectors
304	]
305	self.global_size = [self.q_vectors[0].size]
306
307	def release(self):
308	for b in self.q_buffers:
309	b.release()
310	self.q_buffers = []
311
312	class GpuKernel(object):
313	"""
314	Callable SAS kernel.
315
316	kernel is the GpuKernel object to call.
317
318	info is the module information
319
320	input is the DllInput q vectors at which the kernel should be
321	evaluated.
322
323	The resulting call method takes the pars, a list of values for
324	the fixed parameters to the kernel, and pd_pars, a list of (value,weight)
325	vectors for the polydisperse parameters. cutoff determines the
326	integration limits: any points with combined weight less than cutoff
327	will not be calculated.
328
329	Call :meth:`release` when done with the kernel instance.
330	"""
331	def __init__(self, kernel, info, input):
332	self.input = input
333	self.kernel = kernel
334	self.info = info
335	self.res = np.empty(input.nq, input.dtype)
336	dim = '2d' if input.is_2D else '1d'
337	self.fixed_pars = info['partype']['fixed-'+dim]
338	self.pd_pars = info['partype']['pd-'+dim]
339
340	# Inputs and outputs for each kernel call
341	# Note: res may be shorter than res_b if global_size != nq
342	env = environment()
343	self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
344	2MAX_LOOPSinput.dtype.itemsize)
345	for _ in env.queues]
346	self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
347	input.global_size[0]*input.dtype.itemsize)
348	for _ in env.queues]
349
350
351	def __call__(self, pars, pd_pars, cutoff=1e-5):
352	real = np.float32 if self.input.dtype == generate.F32 else np.float64
353	fixed = [real(p) for p in pars]
354	cutoff = real(cutoff)
355	loops = np.hstack(pd_pars)
356	loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten()
357	Nloops = [np.uint32(len(p[0])) for p in pd_pars]
358	#print "loops",Nloops, loops
359
360	#import sys; print >>sys.stderr,"opencl eval",pars
361	#print "opencl eval",pars
362	if len(loops) > 2*MAX_LOOPS:
363	raise ValueError("too many polydispersity points")
364	device_num = 0
365	res_bi = self.res_b[device_num]
366	queuei = environment().queues[device_num]
367	loops_bi = self.loops_b[device_num]
368	loops_l = cl.LocalMemory(len(loops.data))
369	cl.enqueue_copy(queuei, loops_bi, loops)
370	#ctx = environment().context
371	#loops_bi = cl.Buffer(ctx, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=loops)
372	args = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + Nloops
373	self.kernel(queuei, self.input.global_size, None, *args)
374	cl.enqueue_copy(queuei, self.res, res_bi)
375
376	return self.res
377
378	def release(self):
379	for b in self.loops_b:
380	b.release()
381	self.loops_b = []
382	for b in self.res_b:
383	b.release()
384	self.res_b = []
385
386	def __del__(self):
387	self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernelcl.py @ 676351f

Download in other formats: