source: sasmodels/sasmodels/kernelcl.py @ a34b811

Last change on this file was a34b811, checked in by Paul Kienzle <pkienzle@…>, 6 years ago

use radius_effective/radius_effective_mode/radius_effective_modes consistently throughout the code

1"""
2GPU driver for C kernels
3
4TODO: docs are out of date
5
6There should be a single GPU environment running on the system.  This
7environment is constructed on the first call to :func:`env`, and the
8same environment is returned on each call.
9
10After retrieving the environment, the next step is to create the kernel.
11This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
12returns the type of data used by the kernel.
13
14Next a :class:`GpuData` object should be created with the correct kind
15of data.  This data object can be used by multiple kernels, for example,
16if the target model is a weighted sum of multiple kernels.  The data
17should include any extra evaluation points required to compute the proper
18data smearing.  This need not match the square grid for 2D data if there
19is an index saying which q points are active.
20
21Together the GpuData, the program, and a device form a :class:`GpuKernel`.
22This kernel is used during fitting, receiving new sets of parameters and
23evaluating them.  The output value is stored in an output buffer on the
24devices, where it can be combined with other structure factors and form
25factors and have instrumental resolution effects applied.

In order to use OpenCL for your models, you will need OpenCL drivers for
your machine.  These should be available from your graphics card vendor.
Intel provides OpenCL drivers for CPUs as well as their integrated HD
graphics chipsets.  AMD also provides drivers for Intel CPUs, but as of
this writing the performance is lacking compared to the Intel drivers.
NVidia combines drivers for CUDA and OpenCL in one package.  The result
is a bit messy if you have multiple drivers installed.  You can see which
drivers are available by starting python and running:

    import pyopencl as cl
    cl.create_some_context(interactive=True)

Once you have done that, it will show the available drivers which you
can select.  It will then tell you that you can use these drivers
automatically by setting the SAS_OPENCL environment variable, which is
equivalent to PYOPENCL_CTX but does not conflict with other pyopencl
programs.

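For example, the following selects the second device on the first
platform before any models are loaded (a sketch only; the indices
depend on your system):

    import os
    os.environ["SAS_OPENCL"] = "0:1"
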
Some graphics cards have multiple devices on the same card.  You cannot
yet use both of them concurrently to evaluate models, but you can run
the program twice using a different device for each session.

OpenCL kernels are compiled when needed by the device driver.  Some
drivers produce compiler output even when there is no error.  You
can see the output by setting PYOPENCL_COMPILER_OUTPUT=1.  It should be
harmless, albeit annoying.
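
A rough sketch of the intended call sequence, assuming OpenCL is
available (*load_model_info* and *make_source* are the usual sasmodels
helpers; error handling omitted):

    import numpy as np
    from sasmodels.core import load_model_info
    from sasmodels import generate

    model_info = load_model_info("sphere")
    source = generate.make_source(model_info)
    model = GpuModel(source, model_info, dtype=generate.F32)
    kernel = model.make_kernel([np.linspace(0.001, 0.5, 200)])
    # ... evaluate using call_details and values, then clean up.
    kernel.release()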
52"""
53from __future__ import print_function
54
55import sys
56import os
57import warnings
58import logging
59import time
60
61try:
62    from time import perf_counter as clock
63except ImportError: # CRUFT: python < 3.3
64    if sys.platform.count("darwin") > 0:
65        from time import time as clock
66    else:
67        from time import clock
68
69import numpy as np  # type: ignore
70
71# Attempt to setup OpenCL. This may fail if the pyopencl package is not
72# installed or if it is installed but there are no devices available.
73try:
74    import pyopencl as cl  # type: ignore
75    from pyopencl import mem_flags as mf
76    from pyopencl.characterize import get_fast_inaccurate_build_options
77    # Ask OpenCL for the default context so that we know that one exists.
78    cl.create_some_context(interactive=False)
79    HAVE_OPENCL = True
80    OPENCL_ERROR = ""
81except Exception as exc:
82    HAVE_OPENCL = False
83    OPENCL_ERROR = str(exc)
84
85from . import generate
86from .generate import F32, F64
87from .kernel import KernelModel, Kernel
88
89# pylint: disable=unused-import
90try:
    from typing import Tuple, Callable, Any, Dict, List
    from .modelinfo import ModelInfo
    from .details import CallDetails
except ImportError:
    pass
# pylint: enable=unused-import


# CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path).
def quote_path(v):
    # type: (str) -> str
    """
    Quote the path if it is not already quoted.

    If v starts with '-', then assume that it is a -I option or similar
    and do not quote it.  This is fragile:  -Ipath with space needs to
    be quoted.
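
    A doctest-style illustration (the paths are invented):

        >>> quote_path("C:/Program Files/include")
        '"C:/Program Files/include"'
        >>> quote_path("-I/opt/with space/include")
        '-I/opt/with space/include'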
    """
    return '"'+v+'"' if v and ' ' in v and not v[0] in "\"'-" else v


def fix_pyopencl_include():
    # type: () -> None
    """
    Monkey patch pyopencl to allow spaces in include file path.
    """
    # pylint: disable=protected-access
    import pyopencl
    if hasattr(pyopencl, '_DEFAULT_INCLUDE_OPTIONS'):
        pyopencl._DEFAULT_INCLUDE_OPTIONS = [
            quote_path(v) for v in pyopencl._DEFAULT_INCLUDE_OPTIONS
            ]


if HAVE_OPENCL:
    fix_pyopencl_include()

# The maximum number of loops is limited by the amount of local memory
# available on the device.  You don't want to make this value too big because
# it will waste resources, nor too small because it may interfere with users
# trying to do their polydispersity calculations.  The current value of 2048
# should be much larger than necessary given that cost grows as npts^k where
# k is the number of polydisperse parameters.
MAX_LOOPS = 2048

# Pragmas to enable OpenCL features.  Be sure to protect them so that they
# still compile even if OpenCL is not present.
_F16_PRAGMA = """\
#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)
#  pragma OPENCL EXTENSION cl_khr_fp16: enable
#endif
"""

_F64_PRAGMA = """\
#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)
#  pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
"""


def use_opencl():
    # type: () -> bool
    """Return True if OpenCL is the default computational engine"""
    sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower()
    return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda")
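
# For example (illustrative values): SAS_OPENCL=none forces the C compiler
# path, SAS_OPENCL=cuda defers to the CUDA engine defined elsewhere in
# sasmodels, and an unset variable (or SAS_OPENCL=opencl) keeps OpenCL as
# the default engine.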


ENV = None
def reset_environment():
    # type: () -> None
    """
    Call to create a new OpenCL context, such as after a change to SAS_OPENCL.
    """
    global ENV
    ENV = GpuEnvironment() if use_opencl() else None


def environment():
    # type: () -> "GpuEnvironment"
    """
    Returns a singleton :class:`GpuEnvironment`.

    This provides an OpenCL context and one queue per device.
    """
    if ENV is None:
        if not HAVE_OPENCL:
            raise RuntimeError("OpenCL startup failed with ***"
                               + OPENCL_ERROR + "***; using C compiler instead")
        reset_environment()
        if ENV is None:
            raise RuntimeError("SAS_OPENCL=None in environment")
    return ENV
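
# Example (illustrative): fetch the singleton environment and check whether
# the selected device supports double precision before requesting F64:
#
#     env = environment()
#     if env.has_type(F64):
#         program = env.compile_program(name, source, F64, False, timestamp)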


def has_type(device, dtype):
    # type: (cl.Device, np.dtype) -> bool
    """
    Return true if device supports the requested precision.
    """
    if dtype == F32:
        return True
    elif dtype == F64:
        return "cl_khr_fp64" in device.extensions
    else:
        # Not supporting F16 type since it isn't accurate enough.
        return False


def get_warp(kernel, queue):
    # type: (cl.Kernel, cl.CommandQueue) -> int
    """
    Return the size of an execution batch for *kernel* running on *queue*.
    """
    return kernel.get_work_group_info(
        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
        queue.device)

def compile_model(context, source, dtype, fast=False):
    # type: (cl.Context, str, np.dtype, bool) -> cl.Program
    """
    Build a model to run on the gpu.

    Returns the compiled program.

    Raises an error if the desired precision is not available.
    """
    dtype = np.dtype(dtype)
    if not all(has_type(d, dtype) for d in context.devices):
        raise RuntimeError("%s not supported for devices" % dtype)

    source_list = [generate.convert_type(source, dtype)]

    if dtype == generate.F16:
        source_list.insert(0, _F16_PRAGMA)
    elif dtype == generate.F64:
        source_list.insert(0, _F64_PRAGMA)

    # Note: USE_SINCOS makes the Intel CPU slower under OpenCL.
    if context.devices[0].type == cl.device_type.GPU:
        source_list.insert(0, "#define USE_SINCOS\n")
    options = (get_fast_inaccurate_build_options(context.devices[0])
               if fast else [])
    source = "\n".join(source_list)
    program = cl.Program(context, source).build(options=options)

    #print("done with "+program)
    return program
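
# Example (illustrative): compile generated source for single precision on
# the context that the environment selected for F32 data.
#
#     env = environment()
#     program = compile_model(env.context[F32], source_text, F32, fast=False)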


# For now, this returns one device in the context.
# TODO: Create a context that contains all devices on all platforms.
class GpuEnvironment(object):
    """
    GPU context for OpenCL, with possibly many devices and one queue per device.
    """
    def __init__(self):
        # type: () -> None
        # Find gpu context.
        context_list = _create_some_context()

        # Find a context for F32 and for F64 (maybe the same one).
        # F16 isn't good enough.
        self.context = {}
        for dtype in (F32, F64):
            for context in context_list:
                if has_type(context.devices[0], dtype):
                    self.context[dtype] = context
                    break
            else:
                self.context[dtype] = None

        # Build a queue for each context.
        self.queue = {}
        context = self.context[F32]
        self.queue[F32] = cl.CommandQueue(context, context.devices[0])
        if self.context[F64] == self.context[F32]:
            self.queue[F64] = self.queue[F32]
        else:
            context = self.context[F64]
            self.queue[F64] = cl.CommandQueue(context, context.devices[0])

        ## Byte boundary for data alignment.
        #self.data_boundary = max(context.devices[0].min_data_type_align_size
        #                         for context in self.context.values())

        # Cache for compiled programs, and for items in context.
        self.compiled = {}

    def has_type(self, dtype):
        # type: (np.dtype) -> bool
        """
        Return True if all devices support a given type.
        """
        return self.context.get(dtype, None) is not None

    def compile_program(self, name, source, dtype, fast, timestamp):
        # type: (str, str, np.dtype, bool, float) -> cl.Program
        """
        Compile the program for the device in the given context.
        """
        # Note: PyOpenCL already caches programs based on the md5 hash of the
        # source, options and device, but we cache as well to save some data
        # munging time.
        tag = generate.tag_source(source)
        key = "%s-%s-%s%s" % (name, dtype, tag, ("-fast" if fast else ""))
        # Check timestamp on program.
        program, program_timestamp = self.compiled.get(key, (None, np.inf))
        if program_timestamp < timestamp:
            del self.compiled[key]
        if key not in self.compiled:
            context = self.context[dtype]
            logging.info("building %s for OpenCL %s", key,
                         context.devices[0].name.strip())
            program = compile_model(self.context[dtype],
                                    str(source), dtype, fast)
            self.compiled[key] = (program, timestamp)
        return program


def _create_some_context():
    # type: () -> List[cl.Context]
    """
    Protected call to cl.create_some_context without interactivity.

    Uses SAS_OPENCL or PYOPENCL_CTX if they are set in the environment,
    otherwise scans for the most appropriate device using
    :func:`_get_default_context`.  Ignore *SAS_OPENCL=OpenCL*, which
    indicates that an OpenCL device should be used without specifying
    which one (and not a CUDA device, or no GPU).
    """
    # Assume we do not get here if SAS_OPENCL is None or CUDA.
    sas_opencl = os.environ.get('SAS_OPENCL', 'opencl')
    if sas_opencl.lower() != 'opencl':
        # Use the explicit SAS_OPENCL value as PYOPENCL_CTX when creating
        # the cl context.
        os.environ["PYOPENCL_CTX"] = sas_opencl

    if 'PYOPENCL_CTX' in os.environ:
        try:
            return [cl.create_some_context(interactive=False)]
        except Exception as exc:
            # TODO: Should warnings instead be put into logging.warn?
            warnings.warn(str(exc))
            warnings.warn(
                "pyopencl.create_some_context() failed.  The environment "
                "variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set "
                "correctly")

    return _get_default_context()


def _get_default_context():
    # type: () -> List[cl.Context]
    """
    Get an OpenCL context, preferring GPU over CPU, and preferring Intel
    drivers over AMD drivers.
    """
    # Note: on mobile devices there is automatic clock scaling if either the
    # CPU or the GPU is underutilized; probably doesn't affect us, but if it
    # did, it would mean that putting a busy loop on the CPU while the GPU
    # is running may increase throughput.
    #
    # MacBook Pro, base install:
    #     {'Apple': [Intel CPU, NVIDIA GPU]}
    # MacBook Pro, base install:
    #     {'Apple': [Intel CPU, Intel GPU]}
    # 2 x NVIDIA 295 with Intel and NVIDIA opencl drivers installed:
    #     {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]}
    gpu, cpu = None, None
    for platform in cl.get_platforms():
        # AMD provides a much weaker CPU driver than Intel/Apple, so avoid it.
        # If someone has bothered to install the AMD/NVIDIA drivers, prefer
        # them over the integrated graphics driver that may have been supplied
        # with the CPU chipset.
        preferred_cpu = (platform.vendor.startswith('Intel')
                         or platform.vendor.startswith('Apple'))
        preferred_gpu = (platform.vendor.startswith('Advanced')
                         or platform.vendor.startswith('NVIDIA'))
        for device in platform.get_devices():
            if device.type == cl.device_type.GPU:
                # If the existing type is not GPU then it will be CUSTOM
                # or ACCELERATOR so don't override it.
                if gpu is None or (preferred_gpu and gpu.type == cl.device_type.GPU):
                    gpu = device
            elif device.type == cl.device_type.CPU:
                if cpu is None or preferred_cpu:
                    cpu = device
            else:
                # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM
                # Intel Phi for example registers as an accelerator.
                # Since the user installed a custom device on their system
                # and went through the pain of sorting out OpenCL drivers for
                # it, let's assume they really do want to use it as their
                # primary compute device.
                gpu = device

    # Order the devices by gpu then by cpu; when searching for an available
    # device by data type they will be checked in this order, which means
    # that if the gpu supports double then the cpu will never be used (though
    # we may make it possible to explicitly request the cpu at some point).
    devices = []
    if gpu is not None:
        devices.append(gpu)
    if cpu is not None:
        devices.append(cpu)
    return [cl.Context([d]) for d in devices]


class GpuModel(KernelModel):
    """
    GPU wrapper for a single model.

    *source* and *model_info* are the model source and interface as returned
    from :func:`generate.make_source` and :func:`generate.make_model_info`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
    Half precision ('float16', 'half') may be available on some devices.
    Fast precision ('fast') is a loose version of single precision, indicating
    that the compiler is allowed to take shortcuts.
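
    For example, the following (illustrative) calls are equivalent ways to
    request a single precision model:

        model = GpuModel(source, model_info, dtype='f')
        model = GpuModel(source, model_info, dtype=generate.F32)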
    """
    info = None  # type: ModelInfo
    source = ""  # type: str
    dtype = None  # type: np.dtype
    fast = False  # type: bool
    _program = None  # type: cl.Program
    _kernels = None  # type: Dict[str, cl.Kernel]

    def __init__(self, source, model_info, dtype=generate.F32, fast=False):
        # type: (Dict[str, str], ModelInfo, np.dtype, bool) -> None
        self.info = model_info
        self.source = source
        self.dtype = dtype
        self.fast = fast

    def __getstate__(self):
        # type: () -> Tuple[ModelInfo, str, np.dtype, bool]
        return self.info, self.source, self.dtype, self.fast

    def __setstate__(self, state):
        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
        self.info, self.source, self.dtype, self.fast = state
        self._program = self._kernels = None

    def make_kernel(self, q_vectors):
        # type: (List[np.ndarray]) -> "GpuKernel"
        return GpuKernel(self, q_vectors)

    def get_function(self, name):
        # type: (str) -> cl.Kernel
        """
        Fetch the kernel from the environment by name, compiling it if it
        does not already exist.
        """
        if self._program is None:
            self._prepare_program()
        return self._kernels[name]

    def _prepare_program(self):
        # type: () -> None
        env = environment()
        timestamp = generate.ocl_timestamp(self.info)
        program = env.compile_program(
            self.info.name,
            self.source['opencl'],
            self.dtype,
            self.fast,
            timestamp)
        variants = ['Iq', 'Iqxy', 'Imagnetic']
        names = [generate.kernel_name(self.info, k) for k in variants]
        functions = [getattr(program, k) for k in names]
        self._kernels = {k: v for k, v in zip(variants, functions)}
        # Keep a handle to program so GC doesn't collect.
        self._program = program


# TODO: Check that we don't need a destructor for buffers which go out of scope.
class GpuInput(object):
    """
    Make q data available to the gpu.

    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
    to get the best performance on OpenCL, which may involve shifting and
    stretching the array to better match the memory architecture.  Additional
    points will be evaluated with *q=1e-3*.
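
    For example (a sketch; *qx*, *qy* are flattened 2-D mesh arrays):

        q = np.linspace(0.001, 0.5, 200)
        data_1d = GpuInput([q], dtype=generate.F32)
        data_2d = GpuInput([qx.flatten(), qy.flatten()], dtype=generate.F32)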

    *dtype* is the data type for the q vectors. The data type should be
    set to match that of the kernel, which is an attribute of
    :class:`GpuModel`.  Note that not all kernels support double
    precision, so even if the program was created for double precision,
    the *GpuModel.dtype* may be single precision.

    Call :meth:`release` when complete.  Even if not called directly, the
    buffer will be released when the data object is freed.
    """
    def __init__(self, q_vectors, dtype=generate.F32):
        # type: (List[np.ndarray], np.dtype) -> None
        # TODO: Do we ever need double precision q?
        self.nq = q_vectors[0].size
        self.dtype = np.dtype(dtype)
        self.is_2d = (len(q_vectors) == 2)
        # TODO: Stretch input based on get_warp().
        # Not doing it now since warp depends on kernel, which is not known
        # at this point, so instead using 32, which is good on the set of
        # architectures tested so far.
        if self.is_2d:
            width = ((self.nq+15)//16)*16
            self.q = np.empty((width, 2), dtype=dtype)
            self.q[:self.nq, 0] = q_vectors[0]
            self.q[:self.nq, 1] = q_vectors[1]
            # Fill the padding so the kernel never sees uninitialized
            # memory; the docstring promises q=1e-3 for the extra points.
            self.q[self.nq:, :] = 1e-3
        else:
            width = ((self.nq+31)//32)*32
            self.q = np.empty(width, dtype=dtype)
            self.q[:self.nq] = q_vectors[0]
            # Fill the padding so the kernel never sees uninitialized
            # memory; the docstring promises q=1e-3 for the extra points.
            self.q[self.nq:] = 1e-3
        self.global_size = [self.q.shape[0]]
        #print("creating inputs of size", self.global_size)

        # Transfer input value to GPU.
        env = environment()
        context = env.context[self.dtype]
        self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=self.q)

    def release(self):
        # type: () -> None
        """
        Free the buffer associated with the q value.
        """
        if self.q_b is not None:
            self.q_b.release()
            self.q_b = None

    def __del__(self):
        # type: () -> None
        self.release()


class GpuKernel(Kernel):
    """
    Callable SAS kernel.

    *model* is the GpuModel object to call.

    The kernel is derived from :class:`Kernel`, providing the
    :meth:`call_kernel` method to evaluate the kernel for a given set of
    parameters.  Because of the need to move the q values to the GPU before
    evaluation, the kernel is instantiated for a particular set of q vectors,
    and can be called many times without transferring q each time.

    Call :meth:`release` when done with the kernel instance.
    """
    #: SAS model information structure.
    info = None  # type: ModelInfo
    #: Kernel precision.
    dtype = None  # type: np.dtype
    #: Kernel dimensions (1d or 2d).
    dim = ""  # type: str
    #: Calculation results, updated after each call to :meth:`_call_kernel`.
    result = None  # type: np.ndarray

    def __init__(self, model, q_vectors):
        # type: (GpuModel, List[np.ndarray]) -> None
        dtype = model.dtype
        self.q_input = GpuInput(q_vectors, dtype)
        self._model = model

        # Attributes accessed from the outside.
        self.dim = '2d' if self.q_input.is_2d else '1d'
        self.info = model.info
        self.dtype = dtype

        # Converter to translate input to target type.
        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32

        # Holding place for the returned value.
        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
        extra_q = 4  # Total weight, form volume, shell volume and R_eff.
        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype)

        # Allocate result value on GPU.
        env = environment()
        context = env.context[self.dtype]
        width = ((self.result.size+31)//32)*32 * self.dtype.itemsize
        self._result_b = cl.Buffer(context, mf.READ_WRITE, width)

    def _call_kernel(self, call_details, values, cutoff, magnetic,
                     radius_effective_mode):
        # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray
        env = environment()
        queue = env.queue[self._model.dtype]
        context = queue.context

        # Arrange data transfer to card.
        details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                              hostbuf=call_details.buffer)
        values_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=values)

        # Setup kernel function and arguments.
        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
        kernel = self._model.get_function(name)
        kernel_args = [
            np.uint32(self.q_input.nq),  # Number of inputs.
            None,  # Placeholder for pd_start.
            None,  # Placeholder for pd_stop.
            details_b,  # Problem definition.
            values_b,  # Parameter values.
            self.q_input.q_b,  # Q values.
            self._result_b,  # Result storage.
            self._as_dtype(cutoff),  # Probability cutoff.
            np.uint32(radius_effective_mode),  # R_eff mode.
        ]

        # Call kernel and retrieve results.
        #print("Calling OpenCL")
        #call_details.show(values)
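        # The polydispersity loop is split into chunks of roughly one
        # million q-point evaluations each.  Between chunks we wait for the
        # queue to drain and, if more than half a second has passed since
        # the last nap, sleep briefly so that the display driver and other
        # processes can use the device.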
        wait_for = None
        last_nap = clock()
        step = 1000000//self.q_input.nq + 1
        for start in range(0, call_details.num_eval, step):
            stop = min(start + step, call_details.num_eval)
            #print("queuing", start, stop)
            kernel_args[1:3] = [np.int32(start), np.int32(stop)]
            wait_for = [kernel(queue, self.q_input.global_size, None,
                               *kernel_args, wait_for=wait_for)]
            if stop < call_details.num_eval:
                # Allow other processes to run.
                wait_for[0].wait()
                current_time = clock()
                if current_time - last_nap > 0.5:
                    time.sleep(0.001)
                    last_nap = current_time
        cl.enqueue_copy(queue, self.result, self._result_b, wait_for=wait_for)
        #print("result", self.result)

        # Free buffers.
        details_b.release()
        values_b.release()

    def release(self):
        # type: () -> None
        """
        Release resources associated with the kernel.
        """
        self.q_input.release()
        if self._result_b is not None:
            self._result_b.release()
            self._result_b = None

    def __del__(self):
        # type: () -> None
        self.release()