gpu.py @ 14de349

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since 14de349 was 14de349, checked in by Paul Kienzle <pkienzle@…>, 10 years ago
add autogenerated polydispersity loops
Property mode set to `100644`
File size: 9.7 KB

Line
1	"""
2	GPU support through OpenCL
3
4	There should be a single GPU environment running on the system. This
5	environment is constructed on the first call to :func:`env`, and the
6	same environment is returned on each call.
7
8	After retrieving the environment, the next step is to create the kernel.
9	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10	returns the type of data used by the kernel.
11
12	Next a :class:`GpuData` object should be created with the correct kind
13	of data. This data object can be used by multiple kernels, for example,
14	if the target model is a weighted sum of multiple kernels. The data
15	should include any extra evaluation points required to compute the proper
16	data smearing. This need not match the square grid for 2D data if there
17	is an index saying which q points are active.
18
19	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20	This kernel is used during fitting, receiving new sets of parameters and
21	evaluating them. The output value is stored in an output buffer on the
22	devices, where it can be combined with other structure factors and form
23	factors and have instrumental resolution effects applied.
24
25
26	"""
27	import warnings
28
29	import numpy as np
30	import pyopencl as cl
31	from pyopencl import mem_flags as mf
32
33	from . import gen
34
35	from .gen import F32, F64
36
37	F32_DEFS = """\
38	#define REAL(x) (x##f)
39	#define real float
40	"""
41
42	F64_DEFS = """\
43	#pragma OPENCL EXTENSION cl_khr_fp64: enable
44	#define REAL(x) (x)
45	#define real double
46	"""
47
48	ENV = None
49	def environment():
50	"""
51	Returns a singleton :class:`GpuEnvironment`.
52
53	This provides an OpenCL context and one queue per device.
54	"""
55	global ENV
56	if ENV is None:
57	ENV = GpuEnvironment()
58	return ENV
59
60	def has_double(device):
61	"""
62	Return true if device supports double precision.
63	"""
64	return "cl_khr_fp64" in device.extensions
65
66
67	def _stretch_input(vector, dtype, extra=1e-3, boundary=128):
68	"""
69	Stretch an input vector to the correct boundary.
70
71	Performance on the kernels can drop by a factor of two or more if the
72	number of values to compute does not fall on a nice power of two
73	boundary. A good choice for the boundary value is the
74	min_data_type_align_size property of the OpenCL device. The usual
75	value of 128 gives a working size as a multiple of 32. The trailing
76	additional vector elements are given a value of extra, and so
77	f(extra) will be computed for each of them. The returned array
78	will thus be a subset of the computed array.
79	"""
80	boundary // dtype.itemsize
81	remainder = vector.size%boundary
82	size = vector.size + (boundary - remainder if remainder != 0 else 0)
83	if size != vector.size:
84	vector = np.hstack((vector, [extra]*(size-vector.size)))
85	return np.ascontiguousarray(vector, dtype=dtype)
86
87
88	def compile_model(context, source, dtype):
89	"""
90	Build a model to run on the gpu.
91
92	Returns the compiled program and its type. The returned type will
93	be float32 even if the desired type is float64 if any of the
94	devices in the context do not support the cl_khr_fp64 extension.
95	"""
96	dtype = np.dtype(dtype)
97	if dtype==F64 and not all(has_double(d) for d in context.devices):
98	warnings.warn(RuntimeWarning("Double precision not support; using single precision instead"))
99	dtype = F32
100
101	header = F64_DEFS if dtype == F64 else F32_DEFS
102	# Note: USE_SINCOS makes the intel cpu slower under opencl
103	if context.devices[0].type == cl.device_type.GPU:
104	header += "#define USE_SINCOS\n"
105	program = cl.Program(context, header+source).build()
106	return program, dtype
107
108
109	def make_result(self, size):
110	self.res = np.empty(size, dtype=self.dtype)
111	self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
112	return self.res, self.res_b
113
114
115	# for now, this returns one device in the context
116	# TODO: create a context that contains all devices on all platforms
117	class GpuEnvironment(object):
118	"""
119	GPU context, with possibly many devices, and one queue per device.
120	"""
121	def __init__(self):
122	self.context = cl.create_some_context()
123	self.queues = [cl.CommandQueue(self.context, d)
124	for d in self.context.devices]
125	self.boundary = max(d.min_data_type_align_size
126	for d in self.context.devices)
127
128	class GpuModel(object):
129	"""
130	GPU wrapper for a single model.
131
132	source and meta are the model source and interface as returned
133	from :func:`gen.make`.
134
135	dtype is the desired model precision. Any numpy dtype for single
136	or double precision floats will do, such as 'f', 'float32' or 'single'
137	for single and 'd', 'float64' or 'double' for double. Double precision
138	is an optional extension which may not be available on all devices.
139	"""
140	def __init__(self, source, meta, dtype=F32):
141	context = environment().context
142	self.meta = meta
143	self.program, self.dtype = compile_model(context, source, dtype)
144
145	#TODO: need better interface
146	self.PARS = dict((p[0],p[2]) for p in meta['parameters'])
147	self.PD_PARS = [p[0] for p in meta['parameters'] if p[4] != ""]
148
149	def __call__(self, input, cutoff=1e-5):
150	if self.dtype != input.dtype:
151	raise TypeError("data and kernel have different types")
152	kernel_name = gen.kernel_name(self.meta, input.is_2D)
153	kernel = getattr(self.program, kernel_name)
154	return GpuKernel(kernel, self.meta, input, cutoff)
155
156	def make_input(self, q_vectors):
157	"""
158	Make q input vectors available to the model.
159
160	This only needs to be done once for all models that operate on the
161	same input. So for example, if you are adding two different models
162	together to compare to a data set, then only one model needs to
163	needs to call make_input, so long as the models have the same dtype.
164	"""
165	# Note: the weird interface, where make_input doesn't care which
166	# model calls it, allows us to ask the model to define the data
167	# and the caller does not need to know if it is opencl or ctypes.
168	# The returned data object is opaque.
169	return GpuInput(q_vectors, dtype=self.dtype)
170
171	# TODO: check that we don't need a destructor for buffers which go out of scope
172	class GpuInput(object):
173	"""
174	Make q data available to the gpu.
175
176	q_vectors is a list of q vectors, which will be [q] for 1-D data,
177	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
178	to get the best performance on OpenCL, which may involve shifting and
179	stretching the array to better match the memory architecture. Additional
180	points will be evaluated with q=1e-3.
181
182	dtype is the data type for the q vectors. The data type should be
183	set to match that of the kernel, which is an attribute of
184	:class:`GpuProgram`. Note that not all kernels support double
185	precision, so even if the program was created for double precision,
186	the GpuProgram.dtype may be single precision.
187
188	Call :meth:`release` when complete. Even if not called directly, the
189	buffer will be released when the data object is freed.
190	"""
191	def __init__(self, q_vectors, dtype=F32):
192	env = environment()
193	self.nq = q_vectors[0].size
194	self.dtype = np.dtype(dtype)
195	self.is_2D = (len(q_vectors) == 2)
196	self.q_vectors = [
197	_stretch_input(q, self.dtype, boundary=env.boundary)
198	for q in q_vectors
199	]
200	self.q_buffers = [
201	cl.Buffer(env.context, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=q)
202	for q in self.q_vectors
203	]
204	self.global_size = [self.q_vectors[0].size]
205
206	def release(self):
207	for b in self.q_buffers:
208	b.release()
209	self.q_buffers = []
210
211	class GpuKernel(object):
212	def __init__(self, kernel, meta, input, cutoff):
213	env = environment()
214
215	self.cutoff = cutoff
216	self.input = input
217	self.kernel = kernel
218	self.meta = meta
219
220	# Inputs and outputs for each kernel call
221	self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
222	1024*input.dtype.itemsize)
223	for _ in env.queues]
224	self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
225	input.global_size[0]*input.dtype.itemsize)
226	for _ in env.queues]
227
228	# Note: may be shorter than res_b if global_size != nq
229	self.res = np.empty(input.nq, input.dtype)
230
231	# Determine the set of fixed and polydisperse parameters
232	self.fixed_pars = [p[0] for p in meta['parameters'] if p[4] == ""]
233	self.pd_pars = [p for p in meta['parameters']
234	if p[4]=="volume" or (p[4]=="orientation" and input.is_2D)]
235
236	def eval(self, pars):
237	device_num = 0
238	res_bi = self.res_b[device_num]
239	queuei = environment().queues[device_num]
240	fixed, loops, loop_n = \
241	gen.kernel_pars(pars, self.meta, self.input.is_2D, dtype=self.input.dtype)
242	loops_l = cl.LocalMemory(len(loops.data))
243	real = np.float32 if self.input.dtype == F32 else np.float64
244	cutoff = real(self.cutoff)
245
246	loops_bi = self.loops_b[device_num]
247	cl.enqueue_copy(queuei, loops_bi, loops)
248	#ctx = environment().context
249	#loops_bi = cl.Buffer(ctx, mf.READ_ONLY \| mf.COPY_HOST_PTR, hostbuf=loops)
250	pars = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + loop_n
251	self.kernel(queuei, self.input.global_size, None, *pars)
252	cl.enqueue_copy(queuei, self.res, res_bi)
253
254	return self.res
255
256	def release(self):
257	for b in self.loops_b:
258	b.release()
259	self.loops_b = []
260	for b in self.res_b:
261	b.release()
262	self.res_b = []
263
264	def __del__(self):
265	self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/gpu.py @ 14de349

Download in other formats: