gpu.py @ f4cf580

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since f4cf580 was f4cf580, checked in by Paul Kienzle <pkienzle@…>, 10 years ago
resolve remaining differences between sasview and sasmodels
Property mode set to `100644`
File size: 12.1 KB

Line
1	"""
2	GPU support through OpenCL
3
4	There should be a single GPU environment running on the system. This
5	environment is constructed on the first call to :func:`env`, and the
6	same environment is returned on each call.
7
8	After retrieving the environment, the next step is to create the kernel.
9	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10	returns the type of data used by the kernel.
11
12	Next a :class:`GpuData` object should be created with the correct kind
13	of data. This data object can be used by multiple kernels, for example,
14	if the target model is a weighted sum of multiple kernels. The data
15	should include any extra evaluation points required to compute the proper
16	data smearing. This need not match the square grid for 2D data if there
17	is an index saying which q points are active.
18
19	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20	This kernel is used during fitting, receiving new sets of parameters and
21	evaluating them. The output value is stored in an output buffer on the
22	devices, where it can be combined with other structure factors and form
23	factors and have instrumental resolution effects applied.
24	"""
25	import numpy as np
26	import pyopencl as cl
27	from pyopencl import mem_flags as mf
28
29	from . import gen
30
31	F32_DEFS = """\
32	#define REAL(x) (x##f)
33	#define real float
34	"""
35
36	F64_DEFS = """\
37	#pragma OPENCL EXTENSION cl_khr_fp64: enable
38	#define REAL(x) (x)
39	#define real double
40	"""
41
42	# The max loops number is limited by the amount of local memory available
43	# on the device. You don't want to make this value too big because it will
44	# waste resources, nor too small because it may interfere with users trying
45	# to do their polydispersity calculations. A value of 1024 should be much
46	# larger than necessary given that cost grows as npts^k where k is the number
47	# of polydisperse parameters.
48	MAX_LOOPS = 2048
49
50	def load_model(kernel_module, dtype="single"):
51	"""
52	Load the OpenCL model defined by kernel_module.
53
54	Access to the OpenCL device is delayed until the kernel is called
55	so models can be defined without using too many resources.
56	"""
57	source, info = gen.make(kernel_module)
58	## for debugging, save source to a .cl file, edit it, and reload as model
59	#open(info['name']+'.cl','w').write(source)
60	#source = open(info['name']+'.cl','r').read()
61	return GpuModel(source, info, dtype)
62
63	ENV = None
64	def environment():
65	"""
66	Returns a singleton :class:`GpuEnvironment`.
67
68	This provides an OpenCL context and one queue per device.
69	"""
70	global ENV
71	if ENV is None:
72	ENV = GpuEnvironment()
73	return ENV
74
75	def has_double(device):
76	"""
77	Return true if device supports double precision.
78	"""
79	return "cl_khr_fp64" in device.extensions
80
81
82	def _stretch_input(vector, dtype, extra=1e-3, boundary=128):
83	"""
84	Stretch an input vector to the correct boundary.
85
86	Performance on the kernels can drop by a factor of two or more if the
87	number of values to compute does not fall on a nice power of two
88	boundary. A good choice for the boundary value is the
89	min_data_type_align_size property of the OpenCL device. The usual
90	value of 128 gives a working size as a multiple of 32. The trailing
91	additional vector elements are given a value of extra, and so
92	f(extra) will be computed for each of them. The returned array
93	will thus be a subset of the computed array.
94	"""
95	boundary // dtype.itemsize
96	remainder = vector.size%boundary
97	size = vector.size + (boundary - remainder if remainder != 0 else 0)
98	if size != vector.size:
99	vector = np.hstack((vector, [extra]*(size-vector.size)))
100	return np.ascontiguousarray(vector, dtype=dtype)
101
102
103	def compile_model(context, source, dtype):
104	"""
105	Build a model to run on the gpu.
106
107	Returns the compiled program and its type. The returned type will
108	be float32 even if the desired type is float64 if any of the
109	devices in the context do not support the cl_khr_fp64 extension.
110	"""
111	dtype = np.dtype(dtype)
112	if dtype==gen.F64 and not all(has_double(d) for d in context.devices):
113	raise RuntimeError("Double precision not supported for devices")
114
115	header = F64_DEFS if dtype == gen.F64 else F32_DEFS
116	# Note: USE_SINCOS makes the intel cpu slower under opencl
117	if context.devices[0].type == cl.device_type.GPU:
118	header += "#define USE_SINCOS\n"
119	program = cl.Program(context, header+source).build()
120	return program
121
122
123	def make_result(self, size):
124	self.res = np.empty(size, dtype=self.dtype)
125	self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
126	return self.res, self.res_b
127
128
129	# for now, this returns one device in the context
130	# TODO: create a context that contains all devices on all platforms
131	class GpuEnvironment(object):
132	"""
133	GPU context, with possibly many devices, and one queue per device.
134	"""
135	def __init__(self):
136	self.context = cl.create_some_context()
137	self.queues = [cl.CommandQueue(self.context, d)
138	for d in self.context.devices]
139	self.boundary = max(d.min_data_type_align_size
140	for d in self.context.devices)
141	self.has_double = all(has_double(d) for d in self.context.devices)
142	self.compiled = {}
143
144	def compile_program(self, name, source, dtype):
145	if name not in self.compiled:
146	#print "compiling",name
147	self.compiled[name] = compile_model(self.context, source, dtype)
148	return self.compiled[name]
149
150	def release_program(self, name):
151	if name in self.compiled:
152	self.compiled[name].release()
153	del self.compiled[name]
154
155	class GpuModel(object):
156	"""
157	GPU wrapper for a single model.
158
159	source and info are the model source and interface as returned
160	from :func:`gen.make`.
161
162	dtype is the desired model precision. Any numpy dtype for single
163	or double precision floats will do, such as 'f', 'float32' or 'single'
164	for single and 'd', 'float64' or 'double' for double. Double precision
165	is an optional extension which may not be available on all devices.
166	"""
167	def __init__(self, source, info, dtype=gen.F32):
168	self.info = info
169	self.source = source
170	self.dtype = dtype
171	self.program = None # delay program creation
172
173	def __getstate__(self):
174	state = self.__dict__.copy()
175	state['program'] = None
176	return state
177
178	def __setstate__(self, state):
179	self.__dict__ = state.copy()
180
181	def __call__(self, input):
182	if self.dtype != input.dtype:
183	raise TypeError("data and kernel have different types")
184	if self.program is None:
185	self.program = environment().compile_program(self.info['name'],self.source, self.dtype)
186	kernel_name = gen.kernel_name(self.info, input.is_2D)
187	kernel = getattr(self.program, kernel_name)
188	return GpuKernel(kernel, self.info, input)
189
190	def release(self):
191	if self.program is not None:
192	environment().release_program(self.info['name'])
193	self.program = None
194
195	def make_input(self, q_vectors):
196	"""
197	Make q input vectors available to the model.
198
199	This only needs to be done once for all models that operate on the
200	same input. So for example, if you are adding two different models
201	together to compare to a data set, then only one model needs to
202	needs to call make_input, so long as the models have the same dtype.
203	"""
204	# Note: the weird interface, where make_input doesn't care which
205	# model calls it, allows us to ask the model to define the data
206	# and the caller does not need to know if it is opencl or ctypes.
207	# The returned data object is opaque.
208	return GpuInput(q_vectors, dtype=self.dtype)
209
210	# TODO: check that we don't need a destructor for buffers which go out of scope
211	class GpuInput(object):
212	"""
213	Make q data available to the gpu.
214
215	q_vectors is a list of q vectors, which will be [q] for 1-D data,
216	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
217	to get the best performance on OpenCL, which may involve shifting and
218	stretching the array to better match the memory architecture. Additional
219	points will be evaluated with q=1e-3.
220
221	dtype is the data type for the q vectors. The data type should be
222	set to match that of the kernel, which is an attribute of
223	:class:`GpuProgram`. Note that not all kernels support double
224	precision, so even if the program was created for double precision,
225	the GpuProgram.dtype may be single precision.
226
227	Call :meth:`release` when complete. Even if not called directly, the
228	buffer will be released when the data object is freed.
229	"""
230	def __init__(self, q_vectors, dtype=gen.F32):
231	env = environment()
232	self.nq = q_vectors[0].size
233	self.dtype = np.dtype(dtype)
234	self.is_2D = (len(q_vectors) == 2)
235	self.q_vectors = [
236	_stretch_input(q, self.dtype, boundary=env.boundary)
237	for q in q_vectors
238	]
239	self.q_buffers = [
240	cl.Buffer(env.context, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=q)
241	for q in self.q_vectors
242	]
243	self.global_size = [self.q_vectors[0].size]
244
245	def release(self):
246	for b in self.q_buffers:
247	b.release()
248	self.q_buffers = []
249
250	class GpuKernel(object):
251	"""
252	Callable SAS kernel.
253
254	kernel is the GpuKernel object to call.
255
256	info is the module information
257
258	input is the DllInput q vectors at which the kernel should be
259	evaluated.
260
261	The resulting call method takes the pars, a list of values for
262	the fixed parameters to the kernel, and pd_pars, a list of (value,weight)
263	vectors for the polydisperse parameters. cutoff determines the
264	integration limits: any points with combined weight less than cutoff
265	will not be calculated.
266
267	Call :meth:`release` when done with the kernel instance.
268	"""
269	def __init__(self, kernel, info, input):
270	self.input = input
271	self.kernel = kernel
272	self.info = info
273	self.res = np.empty(input.nq, input.dtype)
274	dim = '2d' if input.is_2D else '1d'
275	self.fixed_pars = info['partype']['fixed-'+dim]
276	self.pd_pars = info['partype']['pd-'+dim]
277
278	# Inputs and outputs for each kernel call
279	# Note: res may be shorter than res_b if global_size != nq
280	env = environment()
281	self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
282	2MAX_LOOPSinput.dtype.itemsize)
283	for _ in env.queues]
284	self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
285	input.global_size[0]*input.dtype.itemsize)
286	for _ in env.queues]
287
288
289	def __call__(self, pars, pd_pars, cutoff=1e-5):
290	real = np.float32 if self.input.dtype == gen.F32 else np.float64
291	fixed = [real(p) for p in pars]
292	cutoff = real(cutoff)
293	loops = np.hstack(pd_pars)
294	loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten()
295	Nloops = [np.uint32(len(p[0])) for p in pd_pars]
296	#print "loops",Nloops, loops
297
298	#import sys; print >>sys.stderr,"opencl eval",pars
299	#print "opencl eval",pars
300	if len(loops) > 2*MAX_LOOPS:
301	raise ValueError("too many polydispersity points")
302	device_num = 0
303	res_bi = self.res_b[device_num]
304	queuei = environment().queues[device_num]
305	loops_bi = self.loops_b[device_num]
306	loops_l = cl.LocalMemory(len(loops.data))
307	cl.enqueue_copy(queuei, loops_bi, loops)
308	#ctx = environment().context
309	#loops_bi = cl.Buffer(ctx, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=loops)
310	args = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + Nloops
311	self.kernel(queuei, self.input.global_size, None, *args)
312	cl.enqueue_copy(queuei, self.res, res_bi)
313
314	return self.res
315
316	def release(self):
317	for b in self.loops_b:
318	b.release()
319	self.loops_b = []
320	for b in self.res_b:
321	b.release()
322	self.res_b = []
323
324	def __del__(self):
325	self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/gpu.py @ f4cf580

Download in other formats: