gpu.py @ b3f6bc3

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since b3f6bc3 was b3f6bc3, checked in by pkienzle, 9 years ago
support pure python model Iq/Iqxy?
Property mode set to `100644`
File size: 12.8 KB

Line
1	"""
2	GPU support through OpenCL
3
4	There should be a single GPU environment running on the system. This
5	environment is constructed on the first call to :func:`env`, and the
6	same environment is returned on each call.
7
8	After retrieving the environment, the next step is to create the kernel.
9	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10	returns the type of data used by the kernel.
11
12	Next a :class:`GpuData` object should be created with the correct kind
13	of data. This data object can be used by multiple kernels, for example,
14	if the target model is a weighted sum of multiple kernels. The data
15	should include any extra evaluation points required to compute the proper
16	data smearing. This need not match the square grid for 2D data if there
17	is an index saying which q points are active.
18
19	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20	This kernel is used during fitting, receiving new sets of parameters and
21	evaluating them. The output value is stored in an output buffer on the
22	devices, where it can be combined with other structure factors and form
23	factors and have instrumental resolution effects applied.
24	"""
25	import numpy as np
26
27	import pyopencl as cl
28	from pyopencl import mem_flags as mf
29
30	from . import generate
31	from .pykernel import PyInput, PyKernel
32
33	F64_DEFS = """\
34	#ifdef cl_khr_fp64
35	# pragma OPENCL EXTENSION cl_khr_fp64: enable
36	#endif
37	"""
38
39	# The max loops number is limited by the amount of local memory available
40	# on the device. You don't want to make this value too big because it will
41	# waste resources, nor too small because it may interfere with users trying
42	# to do their polydispersity calculations. A value of 1024 should be much
43	# larger than necessary given that cost grows as npts^k where k is the number
44	# of polydisperse parameters.
45	MAX_LOOPS = 2048
46
47	def load_model(kernel_module, dtype="single"):
48	"""
49	Load the OpenCL model defined by kernel_module.
50
51	Access to the OpenCL device is delayed until the kernel is called
52	so models can be defined without using too many resources.
53	"""
54	source, info = generate.make(kernel_module)
55	## for debugging, save source to a .cl file, edit it, and reload as model
56	#open(info['name']+'.cl','w').write(source)
57	#source = open(info['name']+'.cl','r').read()
58	return GpuModel(source, info, dtype)
59
60	ENV = None
61	def environment():
62	"""
63	Returns a singleton :class:`GpuEnvironment`.
64
65	This provides an OpenCL context and one queue per device.
66	"""
67	global ENV
68	if ENV is None:
69	ENV = GpuEnvironment()
70	return ENV
71
72	def has_double(device):
73	"""
74	Return true if device supports double precision.
75	"""
76	return "cl_khr_fp64" in device.extensions
77
78	def get_warp(kernel, queue):
79	"""
80	Return the size of an execution batch for kernel running on queue.
81	"""
82	return kernel.get_work_group_info(cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
83	queue.device)
84
85	def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
86	"""
87	Stretch an input vector to the correct boundary.
88
89	Performance on the kernels can drop by a factor of two or more if the
90	number of values to compute does not fall on a nice power of two
91	boundary. The trailing additional vector elements are given a
92	value of extra, and so f(extra) will be computed for each of
93	them. The returned array will thus be a subset of the computed array.
94
95	boundary should be a power of 2 which is at least 32 for good
96	performance on current platforms (as of Jan 2015). It should
97	probably be the max of get_warp(kernel,queue) and
98	device.min_data_type_align_size//4.
99	"""
100	remainder = vector.size%boundary
101	if remainder != 0:
102	size = vector.size + (boundary - remainder)
103	vector = np.hstack((vector, [extra]*(size-vector.size)))
104	return np.ascontiguousarray(vector, dtype=dtype)
105
106
107	def compile_model(context, source, dtype):
108	"""
109	Build a model to run on the gpu.
110
111	Returns the compiled program and its type. The returned type will
112	be float32 even if the desired type is float64 if any of the
113	devices in the context do not support the cl_khr_fp64 extension.
114	"""
115	dtype = np.dtype(dtype)
116	if dtype==generate.F64 and not all(has_double(d) for d in context.devices):
117	raise RuntimeError("Double precision not supported for devices")
118
119	header = F64_DEFS if dtype == generate.F64 else ""
120	if dtype == generate.F32:
121	source = generate.use_single(source)
122	# Note: USE_SINCOS makes the intel cpu slower under opencl
123	if context.devices[0].type == cl.device_type.GPU:
124	header += "#define USE_SINCOS\n"
125	program = cl.Program(context, header+source).build()
126	return program
127
128
129	def make_result(self, size):
130	self.res = np.empty(size, dtype=self.dtype)
131	self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
132	return self.res, self.res_b
133
134
135	# for now, this returns one device in the context
136	# TODO: create a context that contains all devices on all platforms
137	class GpuEnvironment(object):
138	"""
139	GPU context, with possibly many devices, and one queue per device.
140	"""
141	def __init__(self):
142	self.context = cl.create_some_context()
143	self.queues = [cl.CommandQueue(self.context, d)
144	for d in self.context.devices]
145	# Byte boundary for data alignment
146	#self.data_boundary = max(d.min_data_type_align_size
147	# for d in self.context.devices)
148	self.has_double = all(has_double(d) for d in self.context.devices)
149	self.compiled = {}
150
151	def compile_program(self, name, source, dtype):
152	if name not in self.compiled:
153	#print "compiling",name
154	self.compiled[name] = compile_model(self.context, source, dtype)
155	return self.compiled[name]
156
157	def release_program(self, name):
158	if name in self.compiled:
159	self.compiled[name].release()
160	del self.compiled[name]
161
162	class GpuModel(object):
163	"""
164	GPU wrapper for a single model.
165
166	source and info are the model source and interface as returned
167	from :func:`gen.make`.
168
169	dtype is the desired model precision. Any numpy dtype for single
170	or double precision floats will do, such as 'f', 'float32' or 'single'
171	for single and 'd', 'float64' or 'double' for double. Double precision
172	is an optional extension which may not be available on all devices.
173	"""
174	def __init__(self, source, info, dtype=generate.F32):
175	self.info = info
176	self.source = source
177	self.dtype = dtype
178	self.program = None # delay program creation
179
180	def __getstate__(self):
181	state = self.__dict__.copy()
182	state['program'] = None
183	return state
184
185	def __setstate__(self, state):
186	self.__dict__ = state.copy()
187
188	def __call__(self, input):
189	# Support pure python kernel call
190	if input.is_2D and callable(self.info['Iqxy']):
191	return PyKernel(self.info['Iqxy'], self.info, input)
192	elif not input.is_2D and callable(self.info['Iq']):
193	return PyKernel(self.info['Iq'], self.info, input)
194
195	if self.dtype != input.dtype:
196	raise TypeError("data and kernel have different types")
197	if self.program is None:
198	self.program = environment().compile_program(self.info['name'],self.source, self.dtype)
199	kernel_name = generate.kernel_name(self.info, input.is_2D)
200	kernel = getattr(self.program, kernel_name)
201	return GpuKernel(kernel, self.info, input)
202
203	def release(self):
204	if self.program is not None:
205	environment().release_program(self.info['name'])
206	self.program = None
207
208	def make_input(self, q_vectors):
209	"""
210	Make q input vectors available to the model.
211
212	Note that each model needs its own q vector even if the case of
213	mixture models because some models may be OpenCL, some may be
214	ctypes and some may be pure python.
215	"""
216	# Support pure python kernel call
217	if len(q_vectors) == 1 and callable(self.info['Iq']):
218	return PyInput(q_vectors, dtype=self.dtype)
219	elif callable(self.info['Iqxy']):
220	return PyInput(q_vectors, dtype=self.dtype)
221	else:
222	return GpuInput(q_vectors, dtype=self.dtype)
223
224	# TODO: check that we don't need a destructor for buffers which go out of scope
225	class GpuInput(object):
226	"""
227	Make q data available to the gpu.
228
229	q_vectors is a list of q vectors, which will be [q] for 1-D data,
230	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
231	to get the best performance on OpenCL, which may involve shifting and
232	stretching the array to better match the memory architecture. Additional
233	points will be evaluated with q=1e-3.
234
235	dtype is the data type for the q vectors. The data type should be
236	set to match that of the kernel, which is an attribute of
237	:class:`GpuProgram`. Note that not all kernels support double
238	precision, so even if the program was created for double precision,
239	the GpuProgram.dtype may be single precision.
240
241	Call :meth:`release` when complete. Even if not called directly, the
242	buffer will be released when the data object is freed.
243	"""
244	def __init__(self, q_vectors, dtype=generate.F32):
245	env = environment()
246	self.nq = q_vectors[0].size
247	self.dtype = np.dtype(dtype)
248	self.is_2D = (len(q_vectors) == 2)
249	# TODO: stretch input based on get_warp()
250	# not doing it now since warp depends on kernel, which is not known
251	# at this point, so instead using 32, which is good on the set of
252	# architectures tested so far.
253	self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
254	self.q_buffers = [
255	cl.Buffer(env.context, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=q)
256	for q in self.q_vectors
257	]
258	self.global_size = [self.q_vectors[0].size]
259
260	def release(self):
261	for b in self.q_buffers:
262	b.release()
263	self.q_buffers = []
264
265	class GpuKernel(object):
266	"""
267	Callable SAS kernel.
268
269	kernel is the GpuKernel object to call.
270
271	info is the module information
272
273	input is the DllInput q vectors at which the kernel should be
274	evaluated.
275
276	The resulting call method takes the pars, a list of values for
277	the fixed parameters to the kernel, and pd_pars, a list of (value,weight)
278	vectors for the polydisperse parameters. cutoff determines the
279	integration limits: any points with combined weight less than cutoff
280	will not be calculated.
281
282	Call :meth:`release` when done with the kernel instance.
283	"""
284	def __init__(self, kernel, info, input):
285	self.input = input
286	self.kernel = kernel
287	self.info = info
288	self.res = np.empty(input.nq, input.dtype)
289	dim = '2d' if input.is_2D else '1d'
290	self.fixed_pars = info['partype']['fixed-'+dim]
291	self.pd_pars = info['partype']['pd-'+dim]
292
293	# Inputs and outputs for each kernel call
294	# Note: res may be shorter than res_b if global_size != nq
295	env = environment()
296	self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
297	2MAX_LOOPSinput.dtype.itemsize)
298	for _ in env.queues]
299	self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
300	input.global_size[0]*input.dtype.itemsize)
301	for _ in env.queues]
302
303
304	def __call__(self, pars, pd_pars, cutoff=1e-5):
305	real = np.float32 if self.input.dtype == generate.F32 else np.float64
306	fixed = [real(p) for p in pars]
307	cutoff = real(cutoff)
308	loops = np.hstack(pd_pars)
309	loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten()
310	Nloops = [np.uint32(len(p[0])) for p in pd_pars]
311	#print "loops",Nloops, loops
312
313	#import sys; print >>sys.stderr,"opencl eval",pars
314	#print "opencl eval",pars
315	if len(loops) > 2*MAX_LOOPS:
316	raise ValueError("too many polydispersity points")
317	device_num = 0
318	res_bi = self.res_b[device_num]
319	queuei = environment().queues[device_num]
320	loops_bi = self.loops_b[device_num]
321	loops_l = cl.LocalMemory(len(loops.data))
322	cl.enqueue_copy(queuei, loops_bi, loops)
323	#ctx = environment().context
324	#loops_bi = cl.Buffer(ctx, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=loops)
325	args = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + Nloops
326	self.kernel(queuei, self.input.global_size, None, *args)
327	cl.enqueue_copy(queuei, self.res, res_bi)
328
329	return self.res
330
331	def release(self):
332	for b in self.loops_b:
333	b.release()
334	self.loops_b = []
335	for b in self.res_b:
336	b.release()
337	self.res_b = []
338
339	def __del__(self):
340	self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/gpu.py @ b3f6bc3

Download in other formats: