source: sasmodels/sasmodels/kernelcl.py @ 0763009

Last change on this file since 0763009 was 0763009, checked in by Paul Kienzle <pkienzle@…>, 8 years ago

minor code cleanup

1"""
2GPU support through OpenCL
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
24"""
import os
import warnings

import numpy as np

try:
    import pyopencl as cl
    # Ask OpenCL for the default context so that we know that one exists
    cl.create_some_context(interactive=False)
except Exception as exc:
    warnings.warn(str(exc))
    raise RuntimeError("OpenCL not available")

from pyopencl import mem_flags as mf

from . import generate

F64_DEFS = """\
#ifdef cl_khr_fp64
#  pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
"""

# The maximum number of loop values is limited by the amount of local memory
# available on the device.  You don't want to make this value too big because
# it will waste resources, nor too small because it may interfere with users
# trying to do their polydispersity calculations.  A value of 2048 should be
# much larger than necessary given that cost grows as npts^k where k is the
# number of polydisperse parameters.
MAX_LOOPS = 2048


ENV = None
def environment():
    """
    Returns a singleton :class:`GpuEnvironment`.

    This provides an OpenCL context and one queue per device.
    """
    global ENV
    if ENV is None:
        ENV = GpuEnvironment()
    return ENV

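# For example (illustrative), the environment can be queried to decide which
# precision to request before building a model:
#
#     env = environment()
#     dtype = generate.F64 if env.has_double else generate.F32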
def has_double(device):
    """
    Return true if device supports double precision.
    """
    return "cl_khr_fp64" in device.extensions

def get_warp(kernel, queue):
    """
    Return the size of an execution batch for *kernel* running on *queue*.
    """
    return kernel.get_work_group_info(
        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
        queue.device)

def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
    """
    Stretch an input vector to the correct boundary.

    Performance on the kernels can drop by a factor of two or more if the
    number of values to compute does not fall on a nice power of two
    boundary.  The trailing additional vector elements are given a
    value of *extra*, and so f(*extra*) will be computed for each of
    them.  The values of interest will thus be a subset of the computed
    values.

    *boundary* should be a power of 2 which is at least 32 for good
    performance on current platforms (as of Jan 2015).  It should
    probably be the max of get_warp(kernel, queue) and
    device.min_data_type_align_size//4.
    """
    remainder = vector.size % boundary
    if remainder != 0:
        size = vector.size + (boundary - remainder)
        vector = np.hstack((vector, [extra] * (size - vector.size)))
    return np.ascontiguousarray(vector, dtype=dtype)

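# Worked example (illustrative): a 100-point vector padded to the next
# multiple of 32 gains 28 trailing points, each set to *extra*:
#
#     q = np.linspace(0.001, 0.5, 100)
#     padded = _stretch_input(q, np.float32, extra=1e-3, boundary=32)
#     assert padded.size == 128 and padded[-1] == np.float32(1e-3)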

def compile_model(context, source, dtype):
    """
    Build a model to run on the GPU.

    Returns the compiled program.  Raises RuntimeError if the desired type
    is float64 but any device in the context lacks the cl_khr_fp64
    extension needed for double precision.
    """
    dtype = np.dtype(dtype)
    if dtype == generate.F64 and not all(has_double(d) for d in context.devices):
        raise RuntimeError("Double precision not supported for devices")

    header = F64_DEFS if dtype == generate.F64 else ""
    if dtype == generate.F32:
        source = generate.use_single(source)
    # Note: USE_SINCOS makes the intel cpu slower under opencl
    if context.devices[0].type == cl.device_type.GPU:
        header += "#define USE_SINCOS\n"
    program = cl.Program(context, header + source).build()
    return program


def make_result(self, size):
    # Allocate the result array on the host and a matching buffer on the device.
    self.res = np.empty(size, dtype=self.dtype)
    self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
    return self.res, self.res_b


# for now, this returns one device in the context
# TODO: create a context that contains all devices on all platforms
class GpuEnvironment(object):
    """
    GPU context, with possibly many devices, and one queue per device.
    """
    def __init__(self):
        # find gpu context
        #self.context = cl.create_some_context()

        self.context = None
        if 'PYOPENCL_CTX' in os.environ:
            self._create_some_context()

        if not self.context:
            self.context = _get_default_context()

        # Byte boundary for data alignment
        #self.data_boundary = max(d.min_data_type_align_size
        #                         for d in self.context.devices)
        self.queues = [cl.CommandQueue(self.context, d)
                       for d in self.context.devices]
        self.has_double = all(has_double(d) for d in self.context.devices)
        self.compiled = {}

    def _create_some_context(self):
        try:
            self.context = cl.create_some_context(interactive=False)
        except Exception as exc:
            warnings.warn(str(exc))
            warnings.warn("pyopencl.create_some_context() failed")
            warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")

    def compile_program(self, name, source, dtype):
        # Compile on first use and cache the program by model name.
        if name not in self.compiled:
            #print "compiling", name
            self.compiled[name] = compile_model(self.context, source, dtype)
        return self.compiled[name]

    def release_program(self, name):
        # Drop the cached program so its resources can be reclaimed.
        if name in self.compiled:
            self.compiled[name].release()
            del self.compiled[name]

def _get_default_context():
    """
    Return an OpenCL context for the preferred device: the first GPU if one
    is available, otherwise the first device found.
    """
    default = None
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if device.type == cl.device_type.GPU:
                return cl.Context([device])
            if default is None:
                default = device

    if not default:
        raise RuntimeError("OpenCL device not found")

    return cl.Context([default])


class GpuModel(object):
    """
    GPU wrapper for a single model.

    *source* and *info* are the model source and interface as returned
    from :func:`generate.make`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
    """
    def __init__(self, source, info, dtype=generate.F32):
        self.info = info
        self.source = source
        self.dtype = np.dtype(dtype)
        self.program = None # delay program creation

    def __getstate__(self):
        state = self.__dict__.copy()
        state['program'] = None
        return state

    def __setstate__(self, state):
        self.__dict__ = state.copy()

    def __call__(self, q_input):
        if self.dtype != q_input.dtype:
            raise TypeError("data is %s kernel is %s" % (q_input.dtype, self.dtype))
        if self.program is None:
            compiler = environment().compile_program
            self.program = compiler(self.info['name'], self.source, self.dtype)
        kernel_name = generate.kernel_name(self.info, q_input.is_2D)
        kernel = getattr(self.program, kernel_name)
        return GpuKernel(kernel, self.info, q_input)

    def release(self):
        if self.program is not None:
            environment().release_program(self.info['name'])
            self.program = None

    def make_input(self, q_vectors):
        """
        Make q input vectors available to the model.

        Note that each model needs its own q vector even in the case of
        mixture models because some models may be OpenCL, some may be
        ctypes and some may be pure python.
        """
        return GpuInput(q_vectors, dtype=self.dtype)

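# For a mixture (sketch; model_a, model_b, the parameter lists and the weights
# are placeholders), each component model wraps the q vectors separately:
#
#     q_input_a = model_a.make_input([q])
#     q_input_b = model_b.make_input([q])
#     Iq = w_a*model_a(q_input_a)(pars_a, []) + w_b*model_b(q_input_b)(pars_b, [])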
# TODO: check that we don't need a destructor for buffers which go out of scope
class GpuInput(object):
    """
    Make q data available to the gpu.

    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
    to get the best performance on OpenCL, which may involve shifting and
    stretching the array to better match the memory architecture.  Additional
    points will be evaluated with *q=1e-3*.

    *dtype* is the data type for the q vectors.  The data type should be
    set to match that of the kernel, which is an attribute of
    :class:`GpuModel`.  Note that not all kernels support double
    precision, so even if the model was created for double precision,
    the *GpuModel.dtype* may be single precision.

    Call :meth:`release` when complete.  Even if not called directly, the
    buffer will be released when the data object is freed.
    """
    def __init__(self, q_vectors, dtype=generate.F32):
        env = environment()
        self.nq = q_vectors[0].size
        self.dtype = np.dtype(dtype)
        self.is_2D = (len(q_vectors) == 2)
        # TODO: stretch input based on get_warp()
        # not doing it now since warp depends on kernel, which is not known
        # at this point, so instead using 32, which is good on the set of
        # architectures tested so far.
        self.q_vectors = [_stretch_input(q, self.dtype, boundary=32)
                          for q in q_vectors]
        self.q_buffers = [
            cl.Buffer(env.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
            for q in self.q_vectors
        ]
        self.global_size = [self.q_vectors[0].size]

    def release(self):
        for b in self.q_buffers:
            b.release()
        self.q_buffers = []

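# Example (sketch): wrapping a 2-D detector grid for the GPU.  The grid shown
# here is a placeholder for the actual measurement coordinates:
#
#     qx, qy = np.meshgrid(np.linspace(-0.5, 0.5, 128),
#                          np.linspace(-0.5, 0.5, 128))
#     q_input = GpuInput([qx.flatten(), qy.flatten()], dtype=np.float32)
#     ...  # evaluate one or more kernels against q_input
#     q_input.release()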
class GpuKernel(object):
    """
    Callable SAS kernel.

    *kernel* is the compiled OpenCL kernel to call.

    *info* is the module information

    *q_input* is the :class:`GpuInput` q vectors at which the kernel should
    be evaluated.

    The resulting call method takes the *pars*, a list of values for
    the fixed parameters to the kernel, and *pd_pars*, a list of (value, weight)
    vectors for the polydisperse parameters.  *cutoff* determines the
    integration limits: any points with combined weight less than *cutoff*
    will not be calculated.

    Call :meth:`release` when done with the kernel instance.
    """
    def __init__(self, kernel, info, q_input):
        self.q_input = q_input
        self.kernel = kernel
        self.info = info
        self.res = np.empty(q_input.nq, q_input.dtype)
        dim = '2d' if q_input.is_2D else '1d'
        self.fixed_pars = info['partype']['fixed-' + dim]
        self.pd_pars = info['partype']['pd-' + dim]

        # Inputs and outputs for each kernel call
        # Note: res may be shorter than res_b if global_size != nq
        env = environment()
        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
                                  2 * MAX_LOOPS * q_input.dtype.itemsize)
                        for _ in env.queues]
        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
                                q_input.global_size[0] * q_input.dtype.itemsize)
                      for _ in env.queues]

    def __call__(self, fixed_pars, pd_pars, cutoff=1e-5):
        real = np.float32 if self.q_input.dtype == generate.F32 else np.float64

        device_num = 0
        queuei = environment().queues[device_num]
        res_bi = self.res_b[device_num]
        nq = np.uint32(self.q_input.nq)
        if pd_pars:
            cutoff = real(cutoff)
            loops_N = [np.uint32(len(p[0])) for p in pd_pars]
            loops = np.hstack(pd_pars)
            loops = np.ascontiguousarray(loops.T, self.q_input.dtype).flatten()
            #print "loops", loops_N, loops

            #import sys; print >>sys.stderr, "opencl eval", pars
            #print "opencl eval", pars
            if len(loops) > 2 * MAX_LOOPS:
                raise ValueError("too many polydispersity points")

            loops_bi = self.loops_b[device_num]
            cl.enqueue_copy(queuei, loops_bi, loops)
            loops_l = cl.LocalMemory(len(loops.data))
            #ctx = environment().context
            #loops_bi = cl.Buffer(ctx, mf.READ_ONLY|mf.COPY_HOST_PTR, hostbuf=loops)
            dispersed = [loops_bi, loops_l, cutoff] + loops_N
        else:
            dispersed = []
        fixed = [real(p) for p in fixed_pars]
        args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed
        self.kernel(queuei, self.q_input.global_size, None, *args)
        cl.enqueue_copy(queuei, self.res, res_bi)

        return self.res

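    # Example call (sketch only; the parameter names and values are
    # placeholders that depend on the particular model *info*):
    #
    #     radius = np.linspace(40., 60., 21)             # polydisperse radius
    #     weight = np.exp(-0.5*((radius - 50.)/5.)**2)   # Gaussian weights
    #     Iq = kernel([1e-6, 6.3e-6, 0.], [(radius, weight)], cutoff=1e-5)
    #
    # Each entry in *pd_pars* is a (value, weight) pair of vectors; points
    # whose combined weight falls below *cutoff* are not calculated.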
    def release(self):
        for b in self.loops_b:
            b.release()
        self.loops_b = []
        for b in self.res_b:
            b.release()
        self.res_b = []

    def __del__(self):
        self.release()