gpu.py @ d087487b

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since d087487b was d087487b, checked in by Paul Kienzle <pkienzle@…>, 9 years ago
remove warning from radeon drivers that fp64 is already enabled
Property mode set to `100644`
File size: 12.1 KB

Line
1	"""
2	GPU support through OpenCL
3
4	There should be a single GPU environment running on the system. This
5	environment is constructed on the first call to :func:`env`, and the
6	same environment is returned on each call.
7
8	After retrieving the environment, the next step is to create the kernel.
9	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10	returns the type of data used by the kernel.
11
12	Next a :class:`GpuData` object should be created with the correct kind
13	of data. This data object can be used by multiple kernels, for example,
14	if the target model is a weighted sum of multiple kernels. The data
15	should include any extra evaluation points required to compute the proper
16	data smearing. This need not match the square grid for 2D data if there
17	is an index saying which q points are active.
18
19	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20	This kernel is used during fitting, receiving new sets of parameters and
21	evaluating them. The output value is stored in an output buffer on the
22	devices, where it can be combined with other structure factors and form
23	factors and have instrumental resolution effects applied.
24	"""
25	import numpy as np
26	import pyopencl as cl
27	from pyopencl import mem_flags as mf
28
29	from . import gen
30
31	F32_DEFS = """\
32	#define REAL(x) (x##f)
33	#define real float
34	"""
35
36
37	F64_DEFS = """\
38	#ifdef cl_khr_fp64
39	# pragma OPENCL EXTENSION cl_khr_fp64: enable
40	#endif
41	#define REAL(x) (x)
42	#define real double
43	"""
44
45	# The max loops number is limited by the amount of local memory available
46	# on the device. You don't want to make this value too big because it will
47	# waste resources, nor too small because it may interfere with users trying
48	# to do their polydispersity calculations. A value of 1024 should be much
49	# larger than necessary given that cost grows as npts^k where k is the number
50	# of polydisperse parameters.
51	MAX_LOOPS = 2048
52
53	def load_model(kernel_module, dtype="single"):
54	"""
55	Load the OpenCL model defined by kernel_module.
56
57	Access to the OpenCL device is delayed until the kernel is called
58	so models can be defined without using too many resources.
59	"""
60	source, info = gen.make(kernel_module)
61	## for debugging, save source to a .cl file, edit it, and reload as model
62	#open(info['name']+'.cl','w').write(source)
63	#source = open(info['name']+'.cl','r').read()
64	return GpuModel(source, info, dtype)
65
66	ENV = None
67	def environment():
68	"""
69	Returns a singleton :class:`GpuEnvironment`.
70
71	This provides an OpenCL context and one queue per device.
72	"""
73	global ENV
74	if ENV is None:
75	ENV = GpuEnvironment()
76	return ENV
77
78	def has_double(device):
79	"""
80	Return true if device supports double precision.
81	"""
82	return "cl_khr_fp64" in device.extensions
83
84
85	def _stretch_input(vector, dtype, extra=1e-3, boundary=128):
86	"""
87	Stretch an input vector to the correct boundary.
88
89	Performance on the kernels can drop by a factor of two or more if the
90	number of values to compute does not fall on a nice power of two
91	boundary. A good choice for the boundary value is the
92	min_data_type_align_size property of the OpenCL device. The usual
93	value of 128 gives a working size as a multiple of 32. The trailing
94	additional vector elements are given a value of extra, and so
95	f(extra) will be computed for each of them. The returned array
96	will thus be a subset of the computed array.
97	"""
98	boundary // dtype.itemsize
99	remainder = vector.size%boundary
100	size = vector.size + (boundary - remainder if remainder != 0 else 0)
101	if size != vector.size:
102	vector = np.hstack((vector, [extra]*(size-vector.size)))
103	return np.ascontiguousarray(vector, dtype=dtype)
104
105
106	def compile_model(context, source, dtype):
107	"""
108	Build a model to run on the gpu.
109
110	Returns the compiled program and its type. The returned type will
111	be float32 even if the desired type is float64 if any of the
112	devices in the context do not support the cl_khr_fp64 extension.
113	"""
114	dtype = np.dtype(dtype)
115	if dtype==gen.F64 and not all(has_double(d) for d in context.devices):
116	raise RuntimeError("Double precision not supported for devices")
117
118	header = F64_DEFS if dtype == gen.F64 else F32_DEFS
119	# Note: USE_SINCOS makes the intel cpu slower under opencl
120	if context.devices[0].type == cl.device_type.GPU:
121	header += "#define USE_SINCOS\n"
122	program = cl.Program(context, header+source).build()
123	return program
124
125
126	def make_result(self, size):
127	self.res = np.empty(size, dtype=self.dtype)
128	self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
129	return self.res, self.res_b
130
131
132	# for now, this returns one device in the context
133	# TODO: create a context that contains all devices on all platforms
134	class GpuEnvironment(object):
135	"""
136	GPU context, with possibly many devices, and one queue per device.
137	"""
138	def __init__(self):
139	self.context = cl.create_some_context()
140	self.queues = [cl.CommandQueue(self.context, d)
141	for d in self.context.devices]
142	self.boundary = max(d.min_data_type_align_size
143	for d in self.context.devices)
144	self.has_double = all(has_double(d) for d in self.context.devices)
145	self.compiled = {}
146
147	def compile_program(self, name, source, dtype):
148	if name not in self.compiled:
149	#print "compiling",name
150	self.compiled[name] = compile_model(self.context, source, dtype)
151	return self.compiled[name]
152
153	def release_program(self, name):
154	if name in self.compiled:
155	self.compiled[name].release()
156	del self.compiled[name]
157
158	class GpuModel(object):
159	"""
160	GPU wrapper for a single model.
161
162	source and info are the model source and interface as returned
163	from :func:`gen.make`.
164
165	dtype is the desired model precision. Any numpy dtype for single
166	or double precision floats will do, such as 'f', 'float32' or 'single'
167	for single and 'd', 'float64' or 'double' for double. Double precision
168	is an optional extension which may not be available on all devices.
169	"""
170	def __init__(self, source, info, dtype=gen.F32):
171	self.info = info
172	self.source = source
173	self.dtype = dtype
174	self.program = None # delay program creation
175
176	def __getstate__(self):
177	state = self.__dict__.copy()
178	state['program'] = None
179	return state
180
181	def __setstate__(self, state):
182	self.__dict__ = state.copy()
183
184	def __call__(self, input):
185	if self.dtype != input.dtype:
186	raise TypeError("data and kernel have different types")
187	if self.program is None:
188	self.program = environment().compile_program(self.info['name'],self.source, self.dtype)
189	kernel_name = gen.kernel_name(self.info, input.is_2D)
190	kernel = getattr(self.program, kernel_name)
191	return GpuKernel(kernel, self.info, input)
192
193	def release(self):
194	if self.program is not None:
195	environment().release_program(self.info['name'])
196	self.program = None
197
198	def make_input(self, q_vectors):
199	"""
200	Make q input vectors available to the model.
201
202	This only needs to be done once for all models that operate on the
203	same input. So for example, if you are adding two different models
204	together to compare to a data set, then only one model needs to
205	needs to call make_input, so long as the models have the same dtype.
206	"""
207	# Note: the weird interface, where make_input doesn't care which
208	# model calls it, allows us to ask the model to define the data
209	# and the caller does not need to know if it is opencl or ctypes.
210	# The returned data object is opaque.
211	return GpuInput(q_vectors, dtype=self.dtype)
212
213	# TODO: check that we don't need a destructor for buffers which go out of scope
214	class GpuInput(object):
215	"""
216	Make q data available to the gpu.
217
218	q_vectors is a list of q vectors, which will be [q] for 1-D data,
219	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
220	to get the best performance on OpenCL, which may involve shifting and
221	stretching the array to better match the memory architecture. Additional
222	points will be evaluated with q=1e-3.
223
224	dtype is the data type for the q vectors. The data type should be
225	set to match that of the kernel, which is an attribute of
226	:class:`GpuProgram`. Note that not all kernels support double
227	precision, so even if the program was created for double precision,
228	the GpuProgram.dtype may be single precision.
229
230	Call :meth:`release` when complete. Even if not called directly, the
231	buffer will be released when the data object is freed.
232	"""
233	def __init__(self, q_vectors, dtype=gen.F32):
234	env = environment()
235	self.nq = q_vectors[0].size
236	self.dtype = np.dtype(dtype)
237	self.is_2D = (len(q_vectors) == 2)
238	self.q_vectors = [
239	_stretch_input(q, self.dtype, boundary=env.boundary)
240	for q in q_vectors
241	]
242	self.q_buffers = [
243	cl.Buffer(env.context, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=q)
244	for q in self.q_vectors
245	]
246	self.global_size = [self.q_vectors[0].size]
247
248	def release(self):
249	for b in self.q_buffers:
250	b.release()
251	self.q_buffers = []
252
253	class GpuKernel(object):
254	"""
255	Callable SAS kernel.
256
257	kernel is the GpuKernel object to call.
258
259	info is the module information
260
261	input is the DllInput q vectors at which the kernel should be
262	evaluated.
263
264	The resulting call method takes the pars, a list of values for
265	the fixed parameters to the kernel, and pd_pars, a list of (value,weight)
266	vectors for the polydisperse parameters. cutoff determines the
267	integration limits: any points with combined weight less than cutoff
268	will not be calculated.
269
270	Call :meth:`release` when done with the kernel instance.
271	"""
272	def __init__(self, kernel, info, input):
273	self.input = input
274	self.kernel = kernel
275	self.info = info
276	self.res = np.empty(input.nq, input.dtype)
277	dim = '2d' if input.is_2D else '1d'
278	self.fixed_pars = info['partype']['fixed-'+dim]
279	self.pd_pars = info['partype']['pd-'+dim]
280
281	# Inputs and outputs for each kernel call
282	# Note: res may be shorter than res_b if global_size != nq
283	env = environment()
284	self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
285	2MAX_LOOPSinput.dtype.itemsize)
286	for _ in env.queues]
287	self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
288	input.global_size[0]*input.dtype.itemsize)
289	for _ in env.queues]
290
291
292	def __call__(self, pars, pd_pars, cutoff=1e-5):
293	real = np.float32 if self.input.dtype == gen.F32 else np.float64
294	fixed = [real(p) for p in pars]
295	cutoff = real(cutoff)
296	loops = np.hstack(pd_pars)
297	loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten()
298	Nloops = [np.uint32(len(p[0])) for p in pd_pars]
299	#print "loops",Nloops, loops
300
301	#import sys; print >>sys.stderr,"opencl eval",pars
302	#print "opencl eval",pars
303	if len(loops) > 2*MAX_LOOPS:
304	raise ValueError("too many polydispersity points")
305	device_num = 0
306	res_bi = self.res_b[device_num]
307	queuei = environment().queues[device_num]
308	loops_bi = self.loops_b[device_num]
309	loops_l = cl.LocalMemory(len(loops.data))
310	cl.enqueue_copy(queuei, loops_bi, loops)
311	#ctx = environment().context
312	#loops_bi = cl.Buffer(ctx, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=loops)
313	args = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + Nloops
314	self.kernel(queuei, self.input.global_size, None, *args)
315	cl.enqueue_copy(queuei, self.res, res_bi)
316
317	return self.res
318
319	def release(self):
320	for b in self.loops_b:
321	b.release()
322	self.loops_b = []
323	for b in self.res_b:
324	b.release()
325	self.res_b = []
326
327	def __del__(self):
328	self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/gpu.py @ d087487b

Download in other formats: