kernelcl.py @ da32ec3

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since da32ec3 was da32ec3, checked in by ajj, 10 years ago
fix importerror
Property mode set to `100644`
File size: 13.9 KB

Line
1	"""
2	GPU support through OpenCL
3
4	There should be a single GPU environment running on the system. This
5	environment is constructed on the first call to :func:`env`, and the
6	same environment is returned on each call.
7
8	After retrieving the environment, the next step is to create the kernel.
9	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10	returns the type of data used by the kernel.
11
12	Next a :class:`GpuData` object should be created with the correct kind
13	of data. This data object can be used by multiple kernels, for example,
14	if the target model is a weighted sum of multiple kernels. The data
15	should include any extra evaluation points required to compute the proper
16	data smearing. This need not match the square grid for 2D data if there
17	is an index saying which q points are active.
18
19	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20	This kernel is used during fitting, receiving new sets of parameters and
21	evaluating them. The output value is stored in an output buffer on the
22	devices, where it can be combined with other structure factors and form
23	factors and have instrumental resolution effects applied.
24	"""
25	import os
26	import warnings
27
28	import numpy as np
29
30	try:
31	import pyopencl as cl
32	except ImportError,exc:
33	warnings.warn(str(exc))
34	raise RuntimeError("OpenCL not available")
35
36	from pyopencl import mem_flags as mf
37
38	from . import generate
39	from .kernelpy import PyInput, PyKernel
40
41	F64_DEFS = """\
42	#ifdef cl_khr_fp64
43	# pragma OPENCL EXTENSION cl_khr_fp64: enable
44	#endif
45	"""
46
47	# The max loops number is limited by the amount of local memory available
48	# on the device. You don't want to make this value too big because it will
49	# waste resources, nor too small because it may interfere with users trying
50	# to do their polydispersity calculations. A value of 1024 should be much
51	# larger than necessary given that cost grows as npts^k where k is the number
52	# of polydisperse parameters.
53	MAX_LOOPS = 2048
54
55	def load_model(kernel_module, dtype="single"):
56	"""
57	Load the OpenCL model defined by kernel_module.
58
59	Access to the OpenCL device is delayed until the kernel is called
60	so models can be defined without using too many resources.
61	"""
62	source, info = generate.make(kernel_module)
63	## for debugging, save source to a .cl file, edit it, and reload as model
64	#open(info['name']+'.cl','w').write(source)
65	#source = open(info['name']+'.cl','r').read()
66	return GpuModel(source, info, dtype)
67
68	ENV = None
69	def environment():
70	"""
71	Returns a singleton :class:`GpuEnvironment`.
72
73	This provides an OpenCL context and one queue per device.
74	"""
75	global ENV
76	if ENV is None:
77	ENV = GpuEnvironment()
78	return ENV
79
80	def has_double(device):
81	"""
82	Return true if device supports double precision.
83	"""
84	return "cl_khr_fp64" in device.extensions
85
86	def get_warp(kernel, queue):
87	"""
88	Return the size of an execution batch for kernel running on queue.
89	"""
90	return kernel.get_work_group_info(cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
91	queue.device)
92
93	def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
94	"""
95	Stretch an input vector to the correct boundary.
96
97	Performance on the kernels can drop by a factor of two or more if the
98	number of values to compute does not fall on a nice power of two
99	boundary. The trailing additional vector elements are given a
100	value of extra, and so f(extra) will be computed for each of
101	them. The returned array will thus be a subset of the computed array.
102
103	boundary should be a power of 2 which is at least 32 for good
104	performance on current platforms (as of Jan 2015). It should
105	probably be the max of get_warp(kernel,queue) and
106	device.min_data_type_align_size//4.
107	"""
108	remainder = vector.size%boundary
109	if remainder != 0:
110	size = vector.size + (boundary - remainder)
111	vector = np.hstack((vector, [extra]*(size-vector.size)))
112	return np.ascontiguousarray(vector, dtype=dtype)
113
114
115	def compile_model(context, source, dtype):
116	"""
117	Build a model to run on the gpu.
118
119	Returns the compiled program and its type. The returned type will
120	be float32 even if the desired type is float64 if any of the
121	devices in the context do not support the cl_khr_fp64 extension.
122	"""
123	dtype = np.dtype(dtype)
124	if dtype==generate.F64 and not all(has_double(d) for d in context.devices):
125	raise RuntimeError("Double precision not supported for devices")
126
127	header = F64_DEFS if dtype == generate.F64 else ""
128	if dtype == generate.F32:
129	source = generate.use_single(source)
130	# Note: USE_SINCOS makes the intel cpu slower under opencl
131	if context.devices[0].type == cl.device_type.GPU:
132	header += "#define USE_SINCOS\n"
133	program = cl.Program(context, header+source).build()
134	return program
135
136
137	def make_result(self, size):
138	self.res = np.empty(size, dtype=self.dtype)
139	self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
140	return self.res, self.res_b
141
142
143	# for now, this returns one device in the context
144	# TODO: create a context that contains all devices on all platforms
145	class GpuEnvironment(object):
146	"""
147	GPU context, with possibly many devices, and one queue per device.
148	"""
149	def __init__(self):
150	# find gpu context
151	#self.context = cl.create_some_context()
152
153	self.context = None
154	if 'PYOPENCL_CTX' in os.environ:
155	self._create_some_context()
156
157	if not self.context:
158	self.context = self._find_context()
159
160	# Byte boundary for data alignment
161	#self.data_boundary = max(d.min_data_type_align_size
162	# for d in self.context.devices)
163	self.queues = [cl.CommandQueue(self.context, d)
164	for d in self.context.devices]
165	self.has_double = all(has_double(d) for d in self.context.devices)
166	self.compiled = {}
167
168	def _create_some_context(self):
169	try:
170	self.context = cl.create_some_context(interactive=False)
171	except Exception,exc:
172	warnings.warn(str(exc))
173	warnings.warn("pyopencl.create_some_context() failed")
174	warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
175
176	def _find_context(self):
177	default = None
178	for platform in cl.get_platforms():
179	for device in platform.get_devices():
180	if device.type == cl.device_type.GPU:
181	return cl.Context([device])
182	if default is None:
183	default = device
184
185	if not default:
186	raise RuntimeError("OpenCL device not found")
187
188	return cl.Context([default])
189
190	def compile_program(self, name, source, dtype):
191	if name not in self.compiled:
192	#print "compiling",name
193	self.compiled[name] = compile_model(self.context, source, dtype)
194	return self.compiled[name]
195
196	def release_program(self, name):
197	if name in self.compiled:
198	self.compiled[name].release()
199	del self.compiled[name]
200
201
202	class GpuModel(object):
203	"""
204	GPU wrapper for a single model.
205
206	source and info are the model source and interface as returned
207	from :func:`gen.make`.
208
209	dtype is the desired model precision. Any numpy dtype for single
210	or double precision floats will do, such as 'f', 'float32' or 'single'
211	for single and 'd', 'float64' or 'double' for double. Double precision
212	is an optional extension which may not be available on all devices.
213	"""
214	def __init__(self, source, info, dtype=generate.F32):
215	self.info = info
216	self.source = source
217	self.dtype = dtype
218	self.program = None # delay program creation
219
220	def __getstate__(self):
221	state = self.__dict__.copy()
222	state['program'] = None
223	return state
224
225	def __setstate__(self, state):
226	self.__dict__ = state.copy()
227
228	def __call__(self, input):
229	# Support pure python kernel call
230	if input.is_2D and callable(self.info['Iqxy']):
231	return PyKernel(self.info['Iqxy'], self.info, input)
232	elif not input.is_2D and callable(self.info['Iq']):
233	return PyKernel(self.info['Iq'], self.info, input)
234
235	if self.dtype != input.dtype:
236	raise TypeError("data and kernel have different types")
237	if self.program is None:
238	self.program = environment().compile_program(self.info['name'],self.source, self.dtype)
239	kernel_name = generate.kernel_name(self.info, input.is_2D)
240	kernel = getattr(self.program, kernel_name)
241	return GpuKernel(kernel, self.info, input)
242
243	def release(self):
244	if self.program is not None:
245	environment().release_program(self.info['name'])
246	self.program = None
247
248	def make_input(self, q_vectors):
249	"""
250	Make q input vectors available to the model.
251
252	Note that each model needs its own q vector even if the case of
253	mixture models because some models may be OpenCL, some may be
254	ctypes and some may be pure python.
255	"""
256	# Support pure python kernel call
257	if len(q_vectors) == 1 and callable(self.info['Iq']):
258	return PyInput(q_vectors, dtype=self.dtype)
259	elif callable(self.info['Iqxy']):
260	return PyInput(q_vectors, dtype=self.dtype)
261	else:
262	return GpuInput(q_vectors, dtype=self.dtype)
263
264	# TODO: check that we don't need a destructor for buffers which go out of scope
265	class GpuInput(object):
266	"""
267	Make q data available to the gpu.
268
269	q_vectors is a list of q vectors, which will be [q] for 1-D data,
270	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
271	to get the best performance on OpenCL, which may involve shifting and
272	stretching the array to better match the memory architecture. Additional
273	points will be evaluated with q=1e-3.
274
275	dtype is the data type for the q vectors. The data type should be
276	set to match that of the kernel, which is an attribute of
277	:class:`GpuProgram`. Note that not all kernels support double
278	precision, so even if the program was created for double precision,
279	the GpuProgram.dtype may be single precision.
280
281	Call :meth:`release` when complete. Even if not called directly, the
282	buffer will be released when the data object is freed.
283	"""
284	def __init__(self, q_vectors, dtype=generate.F32):
285	env = environment()
286	self.nq = q_vectors[0].size
287	self.dtype = np.dtype(dtype)
288	self.is_2D = (len(q_vectors) == 2)
289	# TODO: stretch input based on get_warp()
290	# not doing it now since warp depends on kernel, which is not known
291	# at this point, so instead using 32, which is good on the set of
292	# architectures tested so far.
293	self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
294	self.q_buffers = [
295	cl.Buffer(env.context, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=q)
296	for q in self.q_vectors
297	]
298	self.global_size = [self.q_vectors[0].size]
299
300	def release(self):
301	for b in self.q_buffers:
302	b.release()
303	self.q_buffers = []
304
305	class GpuKernel(object):
306	"""
307	Callable SAS kernel.
308
309	kernel is the GpuKernel object to call.
310
311	info is the module information
312
313	input is the DllInput q vectors at which the kernel should be
314	evaluated.
315
316	The resulting call method takes the pars, a list of values for
317	the fixed parameters to the kernel, and pd_pars, a list of (value,weight)
318	vectors for the polydisperse parameters. cutoff determines the
319	integration limits: any points with combined weight less than cutoff
320	will not be calculated.
321
322	Call :meth:`release` when done with the kernel instance.
323	"""
324	def __init__(self, kernel, info, input):
325	self.input = input
326	self.kernel = kernel
327	self.info = info
328	self.res = np.empty(input.nq, input.dtype)
329	dim = '2d' if input.is_2D else '1d'
330	self.fixed_pars = info['partype']['fixed-'+dim]
331	self.pd_pars = info['partype']['pd-'+dim]
332
333	# Inputs and outputs for each kernel call
334	# Note: res may be shorter than res_b if global_size != nq
335	env = environment()
336	self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
337	2MAX_LOOPSinput.dtype.itemsize)
338	for _ in env.queues]
339	self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
340	input.global_size[0]*input.dtype.itemsize)
341	for _ in env.queues]
342
343
344	def __call__(self, pars, pd_pars, cutoff=1e-5):
345	real = np.float32 if self.input.dtype == generate.F32 else np.float64
346	fixed = [real(p) for p in pars]
347	cutoff = real(cutoff)
348	loops = np.hstack(pd_pars)
349	loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten()
350	Nloops = [np.uint32(len(p[0])) for p in pd_pars]
351	#print "loops",Nloops, loops
352
353	#import sys; print >>sys.stderr,"opencl eval",pars
354	#print "opencl eval",pars
355	if len(loops) > 2*MAX_LOOPS:
356	raise ValueError("too many polydispersity points")
357	device_num = 0
358	res_bi = self.res_b[device_num]
359	queuei = environment().queues[device_num]
360	loops_bi = self.loops_b[device_num]
361	loops_l = cl.LocalMemory(len(loops.data))
362	cl.enqueue_copy(queuei, loops_bi, loops)
363	#ctx = environment().context
364	#loops_bi = cl.Buffer(ctx, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=loops)
365	args = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + Nloops
366	self.kernel(queuei, self.input.global_size, None, *args)
367	cl.enqueue_copy(queuei, self.res, res_bi)
368
369	return self.res
370
371	def release(self):
372	for b in self.loops_b:
373	b.release()
374	self.loops_b = []
375	for b in self.res_b:
376	b.release()
377	self.res_b = []
378
379	def __del__(self):
380	self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernelcl.py @ da32ec3

Download in other formats: