source: sasmodels/sasmodels/kernelcl.py @ ae2b6b5

Last change on this file since ae2b6b5 was ae2b6b5, checked in by Paul Kienzle <pkienzle@…>, 8 years ago

increase code correspondence between iq.c and iq.cl

1"""
2GPU driver for C kernels
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
24
25In order to use OpenCL for your models, you will need OpenCL drivers for
26your machine.  These should be available from your graphics card vendor.
27Intel provides OpenCL drivers for CPUs as well as their integrated HD
28graphics chipsets.  AMD also provides drivers for Intel CPUs, but as of
29this writing the performance is lacking compared to the Intel drivers.
30NVidia combines drivers for CUDA and OpenCL in one package.  The result
31is a bit messy if you have multiple drivers installed.  You can see which
32drivers are available by starting python and running:
33
34    import pyopencl as cl
35    cl.create_some_context(interactive=True)
36
37Once you have done that, it will show the available drivers which you
38can select.  It will then tell you that you can use these drivers
39automatically by setting the PYOPENCL_CTX environment variable.
40
41Some graphics cards have multiple devices on the same card.  You cannot
42yet use both of them concurrently to evaluate models, but you can run
43the program twice using a different device for each session.
44
45OpenCL kernels are compiled when needed by the device driver.  Some
46drivers produce compiler output even when there is no error.  You
47can see the output by setting PYOPENCL_COMPILER_OUTPUT=1.  It should be
48harmless, albeit annoying.
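
A minimal usage sketch (assuming *source* and *model_info* have already been
produced for some model by the :mod:`generate` machinery; the q vector below
stands in for real data):

    import numpy as np
    from sasmodels.kernelcl import GpuModel
    q = np.logspace(-3, -1, 200)
    model = GpuModel(source, model_info, dtype=np.dtype('f'))
    kernel = model.make_kernel([q])
    # call_details, weights and values are packed by the sasmodels driver
    # layers; see GpuKernel.__call__ for the expected layout.
    Iq = kernel(call_details, weights, values, cutoff=1e-5)
    kernel.release()
    model.release()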
49"""
from __future__ import print_function

import os
import warnings

import numpy as np  # type: ignore

try:
    #raise NotImplementedError("OpenCL not yet implemented for new kernel template")
    import pyopencl as cl  # type: ignore
    # Ask OpenCL for the default context so that we know that one exists
    cl.create_some_context(interactive=False)
except Exception as ocl_exc:
    warnings.warn(str(ocl_exc))
    del ocl_exc
    raise RuntimeError("OpenCL not available")

from pyopencl import mem_flags as mf  # type: ignore
from pyopencl.characterize import get_fast_inaccurate_build_options  # type: ignore

from . import generate
from .kernel import KernelModel, Kernel

try:
    # List is referenced in the type comments below, so import it as well.
    from typing import List, Tuple, Callable, Any
    from .modelinfo import ModelInfo
    from .details import CallDetails
except ImportError:
    pass

# The max loops number is limited by the amount of local memory available
# on the device.  You don't want to make this value too big because it will
# waste resources, nor too small because it may interfere with users trying
# to do their polydispersity calculations.  A value of 2048 should be much
# larger than necessary given that cost grows as npts^k where k is the number
# of polydisperse parameters.
MAX_LOOPS = 2048
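# For example, a model with k = 3 polydisperse parameters sampled at
# npts = 10 points each already evaluates 10**3 = 1000 weight combinations,
# so in practice npts stays small and 2048 stored values is ample headroom.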


# Pragmas for enabling OpenCL features.  Be sure to protect them so that they
# still compile even if OpenCL is not present.
_F16_PRAGMA = """\
#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)
#  pragma OPENCL EXTENSION cl_khr_fp16: enable
#endif
"""

_F64_PRAGMA = """\
#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)
#  pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
"""


ENV = None
def environment():
    # type: () -> "GpuEnvironment"
    """
    Returns a singleton :class:`GpuEnvironment`.

    This provides an OpenCL context and one queue per device.
    """
    global ENV
    if ENV is None:
        ENV = GpuEnvironment()
    return ENV

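# Note (illustrative): since the environment is a singleton, repeated calls
# are cheap and programs compiled via compile_program() are shared:
#     assert environment() is environment()
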
def has_type(device, dtype):
    # type: (cl.Device, np.dtype) -> bool
    """
    Return true if device supports the requested precision.
    """
    if dtype == generate.F32:
        return True
    elif dtype == generate.F64:
        return "cl_khr_fp64" in device.extensions
    elif dtype == generate.F16:
        return "cl_khr_fp16" in device.extensions
    else:
        return False

def get_warp(kernel, queue):
    # type: (cl.Kernel, cl.CommandQueue) -> int
    """
    Return the size of an execution batch for *kernel* running on *queue*.
    """
    return kernel.get_work_group_info(
        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
        queue.device)

def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
    # type: (np.ndarray, np.dtype, float, int) -> np.ndarray
    """
    Stretch an input vector to the correct boundary.

    Performance on the kernels can drop by a factor of two or more if the
    number of values to compute does not fall on a nice power of two
    boundary.  The trailing additional vector elements are given a
    value of *extra*, and so f(*extra*) will be computed for each of
    them.  The returned array is thus a superset of the input, and the
    values computed for the trailing elements should be ignored.

    *boundary* should be a power of 2 which is at least 32 for good
    performance on current platforms (as of Jan 2015).  It should
    probably be the max of get_warp(kernel, queue) and
    device.min_data_type_align_size//4.
    """
    remainder = vector.size % boundary
    if remainder != 0:
        size = vector.size + (boundary - remainder)
        vector = np.hstack((vector, [extra] * (size - vector.size)))
    return np.ascontiguousarray(vector, dtype=dtype)
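
# For example (a sketch of the padding): a 100-element vector on the default
# 32-element boundary is stretched to 128 elements, with the trailing 28
# entries set to *extra*:
#     _stretch_input(np.arange(100.), np.float32).size == 128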


def compile_model(context, source, dtype, fast=False):
    # type: (cl.Context, str, np.dtype, bool) -> cl.Program
    """
    Build a model to run on the gpu.

    Returns the compiled program.  Raises RuntimeError if any device in
    the context does not support the requested *dtype*.
    """
    dtype = np.dtype(dtype)
    if not all(has_type(d, dtype) for d in context.devices):
        raise RuntimeError("%s not supported for devices"%dtype)

    source_list = [generate.convert_type(source, dtype)]

    if dtype == generate.F16:
        source_list.insert(0, _F16_PRAGMA)
    elif dtype == generate.F64:
        source_list.insert(0, _F64_PRAGMA)

    # Note: USE_SINCOS makes the intel cpu slower under opencl
    if context.devices[0].type == cl.device_type.GPU:
        source_list.insert(0, "#define USE_SINCOS\n")
    options = (get_fast_inaccurate_build_options(context.devices[0])
               if fast else [])
    source = "\n".join(source_list)
    program = cl.Program(context, source).build(options=options)
    return program


# for now, this returns one device in the context
# TODO: create a context that contains all devices on all platforms
class GpuEnvironment(object):
    """
    GPU context, with possibly many devices, and one queue per device.
    """
    def __init__(self):
        # type: () -> None
        # find gpu context
        #self.context = cl.create_some_context()

        self.context = None
        if 'PYOPENCL_CTX' in os.environ:
            self._create_some_context()

        if not self.context:
            self.context = _get_default_context()

        # Byte boundary for data alignment
        #self.data_boundary = max(d.min_data_type_align_size
        #                         for d in self.context.devices)
        self.queues = [cl.CommandQueue(context, context.devices[0])
                       for context in self.context]
        self.compiled = {}

    def has_type(self, dtype):
        # type: (np.dtype) -> bool
        """
        Return True if any device in any context supports the given type.
        """
        return any(has_type(d, dtype)
                   for context in self.context
                   for d in context.devices)

    def get_queue(self, dtype):
        # type: (np.dtype) -> cl.CommandQueue
        """
        Return a command queue for the kernels of type dtype.
        """
        for context, queue in zip(self.context, self.queues):
            if all(has_type(d, dtype) for d in context.devices):
                return queue

    def get_context(self, dtype):
        # type: (np.dtype) -> cl.Context
        """
        Return an OpenCL context for the kernels of type dtype.
        """
        for context, queue in zip(self.context, self.queues):
            if all(has_type(d, dtype) for d in context.devices):
                return context

    def _create_some_context(self):
        # type: () -> None
        """
        Protected call to cl.create_some_context without interactivity.  Use
        this if PYOPENCL_CTX is set in the environment.  Sets the *context*
        attribute.
        """
        try:
            self.context = [cl.create_some_context(interactive=False)]
        except Exception as exc:
            warnings.warn(str(exc))
            warnings.warn("pyopencl.create_some_context() failed")
            warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")

    def compile_program(self, name, source, dtype, fast=False):
        # type: (str, str, np.dtype, bool) -> cl.Program
        """
        Compile the program for the device in the given context.
        """
        key = "%s-%s-%s"%(name, dtype, fast)
        if key not in self.compiled:
            #print("OpenCL compile",name)
            dtype = np.dtype(dtype)
            program = compile_model(self.get_context(dtype),
                                    str(source), dtype, fast)
            self.compiled[key] = program
        return self.compiled[key]

    def release_program(self, name):
        # type: (str) -> None
        """
        Free memory associated with the program on the device.
        """
        # The cache key is "name-dtype-fast" (see compile_program), so scan
        # for all variants compiled for this model name; a plain name lookup
        # would never match.
        for key in [k for k in self.compiled if k.startswith(name + "-")]:
            self.compiled[key].release()
            del self.compiled[key]
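
# Example (a sketch with a hypothetical model name): compiling through the
# environment caches the program, so repeat requests are free:
#
#     env = environment()
#     program = env.compile_program("mymodel", source, generate.F32, False)
#     assert program is env.compile_program("mymodel", source, generate.F32, False)
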
def _get_default_context():
    # type: () -> List[cl.Context]
    """
    Get a list of single-device OpenCL contexts, preferring GPU over CPU,
    and preferring Intel drivers over AMD drivers.
    """
    # Note: on mobile devices there is automatic clock scaling if either the
    # CPU or the GPU is underutilized; probably doesn't affect us, but if
    # it did, it would mean that putting a busy loop on the CPU while the GPU
    # is running may increase throughput.
    #
    # Macbook pro, base install:
    #     {'Apple': [Intel CPU, NVIDIA GPU]}
    # Macbook pro, base install:
    #     {'Apple': [Intel CPU, Intel GPU]}
    # 2 x nvidia 295 with Intel and NVIDIA opencl drivers installed
    #     {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]}
    gpu, cpu = None, None
    for platform in cl.get_platforms():
        # AMD provides a much weaker CPU driver than Intel/Apple, so avoid it.
        # If someone has bothered to install the AMD/NVIDIA drivers, prefer
        # them over the integrated graphics driver that may have been supplied
        # with the CPU chipset.
        preferred_cpu = (platform.vendor.startswith('Intel')
                         or platform.vendor.startswith('Apple'))
        preferred_gpu = (platform.vendor.startswith('Advanced')
                         or platform.vendor.startswith('NVIDIA'))
        for device in platform.get_devices():
            if device.type == cl.device_type.GPU:
                # If the existing type is not GPU then it will be CUSTOM or
                # ACCELERATOR, so don't override it.
                if gpu is None or (preferred_gpu and gpu.type == cl.device_type.GPU):
                    gpu = device
            elif device.type == cl.device_type.CPU:
                if cpu is None or preferred_cpu:
                    cpu = device
            else:
                # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM;
                # Intel Phi, for example, registers as an accelerator.  Since
                # the user installed a custom device on their system and went
                # through the pain of sorting out OpenCL drivers for it, let's
                # assume they really do want to use it as their primary compute
                # device.
                gpu = device

    # Order the devices by gpu then by cpu; when searching for an available
    # device by data type they will be checked in this order, which means
    # that if the gpu supports double then the cpu will never be used (though
    # we may make it possible to explicitly request the cpu at some point).
    devices = []
    if gpu is not None:
        devices.append(gpu)
    if cpu is not None:
        devices.append(cpu)
    return [cl.Context([d]) for d in devices]


class GpuModel(KernelModel):
    """
    GPU wrapper for a single model.

    *source* and *model_info* are the model source and interface as returned
    from :func:`generate.make_source` and :func:`generate.make_model_info`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
    Half precision ('float16', 'half') may be available on some devices.
    Fast precision ('fast') is a loose version of single precision, indicating
    that the compiler is allowed to take shortcuts.
    """
    def __init__(self, source, model_info, dtype=generate.F32, fast=False):
        # type: (str, ModelInfo, np.dtype, bool) -> None
        self.info = model_info
        self.source = source
        self.dtype = dtype
        self.fast = fast
        self.program = None  # delay program creation

    def __getstate__(self):
        # type: () -> Tuple[ModelInfo, str, np.dtype, bool]
        return self.info, self.source, self.dtype, self.fast

    def __setstate__(self, state):
        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
        self.info, self.source, self.dtype, self.fast = state
        self.program = None

    def make_kernel(self, q_vectors):
        # type: (List[np.ndarray]) -> "GpuKernel"
        if self.program is None:
            compiler = environment().compile_program
            self.program = compiler(self.info.name, self.source,
                                    self.dtype, self.fast)
        is_2d = len(q_vectors) == 2
        kernel_name = generate.kernel_name(self.info, is_2d)
        kernel = getattr(self.program, kernel_name)
        return GpuKernel(kernel, self.dtype, self.info, q_vectors)

    def release(self):
        # type: () -> None
        """
        Free the resources associated with the model.
        """
        if self.program is not None:
            environment().release_program(self.info.name)
            self.program = None

    def __del__(self):
        # type: () -> None
        self.release()

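# Note (a sketch): __getstate__ drops the compiled program and make_kernel
# recreates it lazily, so a GpuModel can be pickled and sent to another
# process:
#
#     import pickle
#     model2 = pickle.loads(pickle.dumps(model))  # recompiles on first use
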
# TODO: check that we don't need a destructor for buffers which go out of scope
class GpuInput(object):
    """
    Make q data available to the gpu.

    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
    to get the best performance on OpenCL, which may involve shifting and
    stretching the array to better match the memory architecture.  Additional
    points will be evaluated with *q=1e-3*.

    *dtype* is the data type for the q vectors. The data type should be
    set to match that of the kernel, which is an attribute of
    :class:`GpuModel`.  Note that not all kernels support double
    precision, so even if the program was created for double precision,
    the *GpuModel.dtype* may be single precision.

    Call :meth:`release` when complete.  Even if not called directly, the
    buffer will be released when the data object is freed.
    """
    def __init__(self, q_vectors, dtype=generate.F32):
        # type: (List[np.ndarray], np.dtype) -> None
        # TODO: do we ever need double precision q?
        env = environment()
        self.nq = q_vectors[0].size
        self.dtype = np.dtype(dtype)
        self.is_2d = (len(q_vectors) == 2)
        # TODO: stretch input based on get_warp()
        # not doing it now since warp depends on kernel, which is not known
        # at this point, so instead using 32, which is good on the set of
        # architectures tested so far.
        if self.is_2d:
            # Note: 17 rather than 15 because the result is 2 elements
            # longer than the input.
            width = ((self.nq+17)//16)*16
            self.q = np.empty((width, 2), dtype=dtype)
            self.q[:self.nq, 0] = q_vectors[0]
            self.q[:self.nq, 1] = q_vectors[1]
        else:
            # Note: 33 rather than 31 because the result is 2 elements
            # longer than the input.
            width = ((self.nq+33)//32)*32
            self.q = np.empty(width, dtype=dtype)
            self.q[:self.nq] = q_vectors[0]
        self.global_size = [self.q.shape[0]]
        context = env.get_context(self.dtype)
        #print("creating inputs of size", self.global_size)
        self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=self.q)

    def release(self):
        # type: () -> None
        """
        Free the memory.
        """
        if self.q_b is not None:
            self.q_b.release()
            self.q_b = None

    def __del__(self):
        # type: () -> None
        self.release()

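# For example (padding arithmetic): with nq = 100 1-D points the buffer width
# is ((100+33)//32)*32 = 128, so 28 padding values are computed and discarded;
# with 100 2-D points it is ((100+17)//16)*16 = 112.
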
class GpuKernel(Kernel):
    """
    Callable SAS kernel.

    *kernel* is the compiled OpenCL kernel to call, as retrieved from the
    program by :meth:`GpuModel.make_kernel`

    *model_info* is the module information

    *q_vectors* is the q vectors at which the kernel should be evaluated

    *dtype* is the kernel precision

    The resulting call method takes *call_details*, which describes the
    polydispersity loops, *weights* and *values*, the flattened parameter
    weights and values, and *cutoff*, which determines the integration
    limits: any points with combined weight less than *cutoff* will not
    be calculated.

    Call :meth:`release` when done with the kernel instance.
    """
    def __init__(self, kernel, dtype, model_info, q_vectors):
        # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None
        max_pd = model_info.parameters.max_pd
        npars = len(model_info.parameters.kernel_parameters)-2
        q_input = GpuInput(q_vectors, dtype)
        self.kernel = kernel
        self.info = model_info
        self.dtype = dtype
        self.dim = '2d' if q_input.is_2d else '1d'
        # plus three for the normalization values
        self.result = np.empty(q_input.nq+3, dtype)

        # Inputs and outputs for each kernel call
        # Note: res may be shorter than res_b if global_size != nq
        env = environment()
        self.queue = env.get_queue(dtype)

        # details is int32 data, padded to an 8 integer boundary
        # (note: *size* is computed but not used here; the details buffer
        # is allocated per call in __call__)
        size = ((max_pd*5 + npars*3 + 2 + 7)//8)*8
        self.result_b = cl.Buffer(self.queue.context, mf.READ_WRITE,
                                  q_input.global_size[0] * dtype.itemsize)
        self.q_input = q_input  # allocated by GpuInput above

        self._need_release = [self.result_b, self.q_input]
        self.real = (np.float32 if dtype == generate.F32
                     else np.float64 if dtype == generate.F64
                     else np.float16 if dtype == generate.F16
                     else np.float32)  # will never get here, so use np.float32

    def __call__(self, call_details, weights, values, cutoff):
        # type: (CallDetails, np.ndarray, np.ndarray, float) -> np.ndarray
        context = self.queue.context
        # Arrange data transfer to card
        details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                              hostbuf=call_details.buffer)
        weights_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                              hostbuf=weights) if len(weights) else None
        values_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=values)

        # Call kernel and retrieve results, processing the polydispersity
        # loop in chunks of *step* entries per kernel launch
        step = 100
        for start in range(0, call_details.total_pd, step):
            stop = min(start+step, call_details.total_pd)
            args = [
                np.uint32(self.q_input.nq), np.int32(start), np.int32(stop),
                details_b, weights_b, values_b, self.q_input.q_b, self.result_b,
                self.real(cutoff),
            ]
            self.kernel(self.queue, self.q_input.global_size, None, *args)
        cl.enqueue_copy(self.queue, self.result, self.result_b)

        # Free buffers
        for v in (details_b, weights_b, values_b):
            if v is not None:
                v.release()

        # result holds the nq I(q) values followed by normalization values;
        # return only the computed I(q)
        return self.result[:self.q_input.nq]

    def release(self):
        # type: () -> None
        """
        Release resources associated with the kernel.
        """
        for v in self._need_release:
            v.release()
        self._need_release = []

    def __del__(self):
        # type: () -> None
        self.release()